GMM - Multivariate - Playground¶
Gaussian Mixture Models in the Multivariate Case
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.metrics import rand_score, adjusted_rand_score
%matplotlib notebook
plt.rcParams['figure.figsize'] = [8, 8]
Generate Data¶
The covariance matrix Sigma must be positive semi-definite.
One way to guarantee a positive semi-definite matrix when generating random matrices is:
Generate a random matrix A
Multiply A by its transpose
The resulting product is positive semi-definite (a quick check is sketched below)
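To see why this works: for any vector $x$, $x^\top (A A^\top) x = \lVert A^\top x \rVert^2 \ge 0$, so $A A^\top$ is always positive semi-definite (and positive definite whenever $A$ has full rank). As a quick numerical sanity check (a small sketch, not part of the original data-generation code), the eigenvalues of such a product can be verified to be non-negative:

A = np.random.uniform(-5, 5, size=(3, 3))
S = A @ A.T
# Eigenvalues of a symmetric PSD matrix are all >= 0 (up to floating-point error)
print(np.linalg.eigvalsh(S))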
# Number of points
N = 1000
# Dimensions
D = 3
# Number of clusters
K = 5
# Proportions
pi = np.random.dirichlet(np.ones(K))
points_in_clusters = np.array(np.round(pi * N), dtype=int)
# Adjust the last count so the cluster sizes sum exactly to N (rounding can be off by one)
points_in_clusters[-1] = N - sum(points_in_clusters[:-1])
# Generate means
means = np.random.uniform(-20, 20, (K, D))
covariances = []
for k in range(K):
    A = np.random.uniform(low=-5, high=5, size=(D, D))
    covariances.append(A @ A.T)
# Identity Covariance Matrix
cov = np.identity(D)
print(f"{pi = }")
print(f"{points_in_clusters = }")
print(f"{means = }")
pi = array([0.3720035 , 0.00803737, 0.07068233, 0.03494731, 0.51432948])
points_in_clusters = array([372, 8, 71, 35, 514])
means = array([[ -4.56199674, 2.35668238, -16.62281791],
[ -7.80293209, -8.41854853, -12.52701084],
[-19.41224242, 19.03505058, 16.44047548],
[ 1.37273957, -0.48681065, 16.03255314],
[-19.84040892, 4.4512197 , 18.05080281]])
X_data = []
for k in range(K):
    n_points = points_in_clusters[k]
    mean = means[k]
    cov = covariances[k]
    X_data.append(np.random.multivariate_normal(mean, cov, n_points))
X = X_data[0]
for cluster in X_data[1:]:
    X = np.concatenate((X, cluster))
X.shape
(1000, 3)
Helper Functions and EM Algorithm¶
Multivariate Normal Distribution¶
def Gaussian(x, mu, cov):
    # Density of a D-dimensional Gaussian N(x | mu, cov), using det(2*pi*cov) = (2*pi)^D * det(cov)
    diff = (x - mu).T
    ans = (np.linalg.det(2 * np.pi * cov) ** -0.5) * np.exp(-0.5 * diff.T @ np.linalg.inv(cov) @ diff)
    return ans
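The function above evaluates the multivariate normal density

$$\mathcal{N}(x \mid \mu, \Sigma) = \frac{1}{\sqrt{\det(2\pi\Sigma)}} \exp\!\left(-\tfrac{1}{2}(x-\mu)^\top \Sigma^{-1} (x-\mu)\right)$$

using the identity $\det(2\pi\Sigma) = (2\pi)^D \det(\Sigma)$, which is why the normalizing constant appears as `np.linalg.det(2 * np.pi * cov) ** -0.5`. As a quick sanity check (a sketch assuming SciPy is available; not part of the original notebook), the value can be compared against `scipy.stats.multivariate_normal`:

from scipy.stats import multivariate_normal

x0 = X[0]
# The two densities should agree closely
print(Gaussian(x0, means[0], covariances[0]))
print(multivariate_normal(mean=means[0], cov=covariances[0]).pdf(x0))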
Initialize Clusters¶
Means initialized from a K-Means fit
Covariances initialized as identity matrices
Proportions all set equal to 1 / K
def initialize_clusters(X, n_clusters):
    km = KMeans(n_clusters).fit(X)
    means = km.cluster_centers_
    covariances = np.array([np.identity(X.shape[1]) for _ in range(n_clusters)])
    proportions = [1 / n_clusters for _ in range(n_clusters)]
    return proportions, means, covariances
original_means = means
original_pis = pi
original_covs = covariances
pi, means, covs = initialize_clusters(X, K)
Expectation Step¶
def expectation_step(X, pi, means, covs):
    # gamma[n][k]: responsibility of component k for point n
    gamma = np.zeros((N, K), dtype=np.double)
    for n in range(N):
        for k in range(K):
            gamma[n][k] = pi[k] * Gaussian(X[n], means[k], covs[k])
            # # Hard-code extremely small or 0 values to be 1e-300
            # if gamma[n][k] < 1e-300:
            #     gamma[n][k] = 1e-300
        # Normalize so the responsibilities for each point sum to 1
        gamma[n] /= gamma[n].sum()
    return gamma
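The E-step computes the responsibility of each component for each point, i.e. the posterior probability that component $k$ generated point $x_n$:

$$\gamma_{nk} = \frac{\pi_k \, \mathcal{N}(x_n \mid \mu_k, \Sigma_k)}{\sum_{j=1}^{K} \pi_j \, \mathcal{N}(x_n \mid \mu_j, \Sigma_j)}$$

Each row of `gamma` sums to 1, which is what the division by `gamma[n].sum()` enforces.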
Maximization Step¶
def maximization_step(X, gamma, pi, means, covs):
    # Note: these are references to the input arrays, so the parameters are updated in place
    optimal_pi = pi
    optimal_means = means
    optimal_covs = covs
    # Effective number of points assigned to each component
    N_k = gamma.sum(axis=0)
    for k in range(K):
        optimal_pi[k] = N_k[k] / N
        gamma_k = gamma[:, k].reshape(-1, 1)
        optimal_means[k] = np.sum(gamma_k * X, axis=0) / N_k[k]
        optimal_covs[k] = (gamma_k * (X - optimal_means[k])).T @ (X - optimal_means[k]) / N_k[k]
    return optimal_pi, optimal_means, optimal_covs
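The M-step re-estimates the parameters from the responsibilities, with $N_k = \sum_{n=1}^{N} \gamma_{nk}$ acting as the effective number of points assigned to component $k$:

$$\pi_k = \frac{N_k}{N}, \qquad \mu_k = \frac{1}{N_k}\sum_{n=1}^{N}\gamma_{nk}\,x_n, \qquad \Sigma_k = \frac{1}{N_k}\sum_{n=1}^{N}\gamma_{nk}\,(x_n-\mu_k)(x_n-\mu_k)^\top$$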
Log Likelihood¶
def log_likelihood(X, pi, means, covs):
    temparray = []
    for n in range(N):
        temp = np.array(sum([pi[k] * Gaussian(X[n], means[k], covs[k]) for k in range(K)]))
        # # Hard-code extremely small or 0 values to 1e-300
        # if temp < 1e-300:
        #     temp = 1e-300
        temparray.append(np.log(temp))
    return sum(temparray)
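Convergence is monitored through the log likelihood of the data under the current mixture,

$$\ln p(X \mid \pi, \mu, \Sigma) = \sum_{n=1}^{N} \ln \left( \sum_{k=1}^{K} \pi_k \, \mathcal{N}(x_n \mid \mu_k, \Sigma_k) \right)$$

which EM is guaranteed never to decrease from one iteration to the next.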
MAX_ITERS = 100
pi, means, covs = initialize_clusters(X, K)
convergence_limit = 0.01
old_likelihood = log_likelihood(X, pi, means, covs)
for i in range(MAX_ITERS):
    gamma = expectation_step(X, pi, means, covs)
    pi, means, covs = maximization_step(X, gamma, pi, means, covs)
    likelihood = log_likelihood(X, pi, means, covs)
    print(f"Iteration {i+1} | Log Likelihood = {likelihood}")
    # Stop once the improvement in log likelihood falls below the convergence limit
    if abs(likelihood - old_likelihood) < convergence_limit:
        print("Converged.")
        break
    old_likelihood = likelihood
Iteration 1 | Log Likelihood = -31267.620534990634
Converged.
Results¶
We look at the Adjusted Rand Score/Index; refer to the Wikipedia page on the Rand index for details.
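As a quick illustration of how the score behaves (a small sketch, not part of the original notebook), it is invariant to renaming the cluster labels and is close to zero, or negative, when the agreement is no better than chance:

# Same partition, labels merely swapped -> perfect score of 1.0
print(adjusted_rand_score([0, 0, 1, 1], [1, 1, 0, 0]))
# Completely mismatched partition -> negative score (below-chance agreement)
print(adjusted_rand_score([0, 0, 1, 1], [0, 1, 0, 1]))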
predictions = [np.argmax(g) for g in gamma]
x, y, z = X.T
data = {
    'x': x,
    'y': y,
    'z': z,
    'pred': predictions
}
df = pd.DataFrame(data)
# Compute K-Means and GMM Adjusted Rand Score
original_classes = []
for i in range(len(X_data)):
    for point in X_data[i]:
        original_classes.append(i)
predicted_classes = []
for g in gamma:
    predicted_classes.append(np.argmax(g))
# Fit K-Means once and take its cluster assignments
km = KMeans(K)
kmeans_predicted_classes = km.fit_predict(X)
df['kmeans_pred'] = kmeans_predicted_classes
gmm_ars = adjusted_rand_score(original_classes, predicted_classes)
kmeans_ars = adjusted_rand_score(original_classes, kmeans_predicted_classes)
print(f"GMM ARS = {gmm_ars}")
print(f"K-Means ARS = {kmeans_ars}")
GMM ARS = 0.3896964516426902
K-Means ARS = 0.38871693142431624
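As a cross-check (a sketch, not part of the original notebook), the same data can be clustered with scikit-learn's built-in EM implementation, sklearn.mixture.GaussianMixture, and scored the same way:

from sklearn.mixture import GaussianMixture

# Fit a full-covariance GMM with the same number of components
gm = GaussianMixture(n_components=K, covariance_type='full', random_state=0)
sklearn_predicted_classes = gm.fit_predict(X)
print(f"sklearn GMM ARS = {adjusted_rand_score(original_classes, sklearn_predicted_classes)}")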
Visualize Data and Predictions¶
df['original_class'] = original_classes
px.scatter_3d(df, x='x', y='y', z='z',
              color=df['original_class'].astype(str), opacity=0.7, width=1000, height=800,
              title="Original",
              color_discrete_sequence=px.colors.qualitative.D3)
px.scatter_3d(df, x='x', y='y', z='z',
              color=df['pred'].astype(str), opacity=0.7, width=800, height=800,
              title="GMM Predictions",
              color_discrete_sequence=px.colors.qualitative.D3)
px.scatter_3d(df, x='x', y='y', z='z',
              color=df['kmeans_pred'].astype(str), opacity=0.7, width=800, height=800,
              title="K-Means Predictions",
              color_discrete_sequence=px.colors.qualitative.D3)