GMM - Multivariate - Playground

Gaussian Mixture Models in the Multivariate Case

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.metrics import rand_score, adjusted_rand_score


%matplotlib notebook
plt.rcParams['figure.figsize'] = [8, 8]

Generate Data

The covariance matrix $\Sigma$ must be positive semi-definite.

One way to guarantee a positive semi-definite matrix when generating random matrices is:

  • Generate a random matrix A

  • Multiply A by its transpose

  • The product $A A^T$ is positive semi-definite, since $x^T A A^T x = \lVert A^T x \rVert^2 \ge 0$ for every $x$ (a quick numerical check follows below)
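
A quick sanity check of this construction, using np.linalg.eigvalsh (which returns the eigenvalues of a symmetric matrix in ascending order):

A = np.random.uniform(-5, 5, (3, 3))
S = A @ A.T
# All eigenvalues are non-negative (up to floating-point error), so S is positive semi-definite
print(np.linalg.eigvalsh(S))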

# Number of points
N = 1000

# Dimensions
D = 3

# Number of clusters
K = 5
# Proportions
pi = np.random.dirichlet(np.ones(K))
points_in_clusters = np.array(np.round(pi * N), dtype=int)

# Adjust the last count so the totals sum to exactly N (rounding can leave it off by one)
points_in_clusters[-1] = N - sum(points_in_clusters[:-1])

# Generate means
means = np.random.uniform(-20, 20, (K, D))

# Generate a random positive semi-definite covariance matrix for each cluster
covariances = []
for k in range(K):
    A = np.random.uniform(low=-5, high=5, size=(D, D))
    covariances.append(A @ A.T)

print(f"{pi = }")
print(f"{points_in_clusters = }")
print(f"{means = }")
pi = array([0.3720035 , 0.00803737, 0.07068233, 0.03494731, 0.51432948])
points_in_clusters = array([372,   8,  71,  35, 514])
means = array([[ -4.56199674,   2.35668238, -16.62281791],
       [ -7.80293209,  -8.41854853, -12.52701084],
       [-19.41224242,  19.03505058,  16.44047548],
       [  1.37273957,  -0.48681065,  16.03255314],
       [-19.84040892,   4.4512197 ,  18.05080281]])
X_data = []
for k in range(K):
    n_points = points_in_clusters[k]
    mean = means[k]
    cov = covariances[k]
    X_data.append(np.random.multivariate_normal(mean, cov, n_points))

    
# Stack all clusters into a single (N, D) data matrix
X = np.concatenate(X_data)

X.shape
(1000, 3)

Helper Functions and EM Algorithm

Multivariate Normal Distribution
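
The function below evaluates the density in its closed form,

$$\mathcal{N}(x \mid \mu, \Sigma) = \det(2\pi\Sigma)^{-1/2} \exp\left(-\tfrac{1}{2}(x - \mu)^T \Sigma^{-1} (x - \mu)\right),$$

where $\det(2\pi\Sigma) = (2\pi)^D \det(\Sigma)$.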

def Gaussian(x, mu, cov):
    # Evaluate the multivariate normal density N(x | mu, cov) at a single point
    diff = x - mu
    ans = (np.linalg.det(2 * np.pi * cov) ** -0.5) * np.exp(-0.5 * diff @ np.linalg.inv(cov) @ diff)
    
    return ans

Initialize Clusters

  • Means initialized with a K-Means fit

  • Covariances initialized as identity matrices

  • Proportions all set equal (1 / K)

def initialize_clusters(X, n_clusters):
    km = KMeans(n_clusters).fit(X)
    means = km.cluster_centers_
    
    covariances = np.array([np.identity(X.shape[1]) for _ in range(n_clusters)])
    
    proportions = [1 / n_clusters for _ in range(n_clusters)]
    
    return proportions, means, covariances
# Keep the ground-truth parameters around for later comparison
original_means = means
original_pis = pi
original_covs = covariances
pi, means, covs = initialize_clusters(X, K)

Expectation Step
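
The E-step computes the responsibilities: the posterior probability that point $x_n$ was generated by component $k$,

$$\gamma_{nk} = \frac{\pi_k \, \mathcal{N}(x_n \mid \mu_k, \Sigma_k)}{\sum_{j=1}^{K} \pi_j \, \mathcal{N}(x_n \mid \mu_j, \Sigma_j)}$$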

def expectation_step(X, pi, means, covs):
    gamma = np.zeros((N, K), dtype=np.double)
    
    for n in range(N):
        for k in range(K):
            # Unnormalized responsibility of component k for point n
            gamma[n, k] = pi[k] * Gaussian(X[n], means[k], covs[k])

            # # Hard-code extremely small or 0 values to be 1e-300
            # if gamma[n, k] < 1e-300:
            #     gamma[n, k] = 1e-300
                
        # Normalize so each row of responsibilities sums to one
        gamma[n] /= gamma[n].sum()
    
    return gamma
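
For reference, the same step can be vectorized; this is a minimal sketch assuming SciPy is available, with scipy.stats.multivariate_normal standing in for the hand-rolled Gaussian above:

from scipy.stats import multivariate_normal

def expectation_step_vectorized(X, pi, means, covs):
    # Weighted density of every point under every component, shape (N, K)
    dens = np.stack([pi[k] * multivariate_normal.pdf(X, mean=means[k], cov=covs[k])
                     for k in range(len(pi))], axis=1)
    # Normalize each row so responsibilities sum to one
    return dens / dens.sum(axis=1, keepdims=True)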

Maximization Step
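
Given the responsibilities, the M-step re-estimates the parameters in closed form, using the effective counts $N_k = \sum_{n=1}^{N} \gamma_{nk}$:

$$\pi_k = \frac{N_k}{N}, \qquad \mu_k = \frac{1}{N_k} \sum_{n=1}^{N} \gamma_{nk} \, x_n, \qquad \Sigma_k = \frac{1}{N_k} \sum_{n=1}^{N} \gamma_{nk} \,(x_n - \mu_k)(x_n - \mu_k)^T$$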

def maximization_step(X, gamma, pi, means, covs):
    # Work on copies so the caller's parameters are not mutated in place
    optimal_pi = np.array(pi, dtype=np.double)
    optimal_means = means.copy()
    optimal_covs = covs.copy()

    N_k = gamma.sum(axis=0)
    for k in range(K):
        optimal_pi[k] = N_k[k] / N
        
        gamma_k = gamma[:, k].reshape(-1, 1)
        
        optimal_means[k] = np.sum(gamma_k * X, axis=0) / N_k[k]
        optimal_covs[k] = (gamma_k * (X - optimal_means[k])).T @ (X - optimal_means[k]) / N_k[k]
    
    return optimal_pi, optimal_means, optimal_covs

Log Likelihood
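
Convergence is monitored via the log likelihood of the data under the current parameters:

$$\ln p(X \mid \pi, \mu, \Sigma) = \sum_{n=1}^{N} \ln \sum_{k=1}^{K} \pi_k \, \mathcal{N}(x_n \mid \mu_k, \Sigma_k)$$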

def log_likelihood(X, pi, means, covs):
    temparray = []
    for n in range(N):
        # Mixture density of the current model at point n
        temp = sum([pi[k] * Gaussian(X[n], means[k], covs[k]) for k in range(K)])
        
        # # Hard-code extremely small or 0 values to 1e-300
        # if temp < 1e-300:
        #     temp = 1e-300
            
        temparray.append(np.log(temp))
    return sum(temparray)
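
The commented-out 1e-300 guard hints at the underflow this computation can hit; a more numerically stable sketch, assuming SciPy is available, works in log space with scipy.special.logsumexp:

from scipy.special import logsumexp
from scipy.stats import multivariate_normal

def log_likelihood_stable(X, pi, means, covs):
    # log p(x_n) = logsumexp_k [ log(pi_k) + log N(x_n | mu_k, Sigma_k) ]
    log_dens = np.stack([np.log(pi[k]) + multivariate_normal.logpdf(X, mean=means[k], cov=covs[k])
                         for k in range(len(pi))], axis=1)
    return logsumexp(log_dens, axis=1).sum()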
MAX_ITERS = 100
pi, means, covs = initialize_clusters(X, K)
convergence_limit = 0.01

old_likelihood = log_likelihood(X, pi, means, covs)
for i in range(MAX_ITERS):
    gamma = expectation_step(X, pi, means, covs)
    pi, means, covs = maximization_step(X, gamma, pi, means, covs)
    likelihood = log_likelihood(X, pi, means, covs)
    
    print(f"Iteration {i+1} | Log Likelihood = {likelihood}")
    
    # Stop once the improvement in log likelihood falls below the threshold;
    # old_likelihood must be refreshed each iteration, otherwise the check
    # compares the first likelihood against itself and "converges" immediately
    if abs(likelihood - old_likelihood) < convergence_limit:
        print("Converged.")
        break
    old_likelihood = likelihood
Iteration 1 | Log Likelihood = -31267.620534990634
Converged.

Results

We evaluate the clusterings with the Adjusted Rand Score/Index; refer to the Wikipedia page on the Rand index for details.
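
The adjusted score is invariant to how the cluster indices are labeled, which matters here because the fitted component numbers are arbitrary; for example:

# A perfect clustering scores 1.0 even when the labels are swapped
print(adjusted_rand_score([0, 0, 1, 1], [1, 1, 0, 0]))  # 1.0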

# GMM prediction for each point: the component with the highest responsibility
predictions = gamma.argmax(axis=1)

x, y, z = X.T

data = {
    'x': x,
    'y': y,
    'z': z,
    'pred': predictions
}

df = pd.DataFrame(data)
# Compute K-Means and GMM Adjusted Rand Score

# Ground-truth labels follow from the order in which the clusters were generated
original_classes = []
for i in range(len(X_data)):
    for point in X_data[i]:
        original_classes.append(i)
        
# GMM labels are the argmax responsibilities already computed above
predicted_classes = predictions
    
# Fit K-Means once and take its cluster assignments (the original fit the model twice)
kmeans_predicted_classes = KMeans(K).fit_predict(X)
df['kmeans_pred'] = kmeans_predicted_classes

gmm_ars = adjusted_rand_score(original_classes, predicted_classes)
kmeans_ars = adjusted_rand_score(original_classes, kmeans_predicted_classes)

print(f"GMM ARS = {gmm_ars}")
print(f"K-Means ARS = {kmeans_ars}")
GMM ARS = 0.3896964516426902
K-Means ARS = 0.38871693142431624

Visualize Data and Predictions

df['original_class'] = original_classes
px.scatter_3d(df, x='x', y='y', z='z',
              color=df['original_class'].astype(str), opacity=0.7, width=800, height=800,
              title="Original",
              color_discrete_sequence=px.colors.qualitative.D3)
px.scatter_3d(df, x='x', y='y', z='z',
              color=df['pred'].astype(str), opacity=0.7, width=800, height=800,
              title="GMM Predictions",
              color_discrete_sequence=px.colors.qualitative.D3)
px.scatter_3d(df, x='x', y='y', z='z',
              color=df['kmeans_pred'].astype(str), opacity=0.7, width=800, height=800,
              title="K-Means Predictions",
              color_discrete_sequence=px.colors.qualitative.D3)