GMM - Multivariate - Playground

Gaussian Mixture Models in the Multivariate Case

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.metrics import rand_score, adjusted_rand_score


%matplotlib notebook
plt.rcParams['figure.figsize'] = [8, 8]

Generate Data

The covariance matrix $\Sigma$ must be positive semi-definite.

One way to guarantee a positive semi-definite matrix when generating random matrices is:

  • Generate a random matrix A

  • Multiply A by its transpose

  • The product $A A^T$ is positive semi-definite, since $x^T A A^T x = \lVert A^T x \rVert^2 \ge 0$ for every $x$ (a quick numerical check follows below)
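
A quick sanity check of this construction, using np.linalg.eigvalsh (which returns the eigenvalues of a symmetric matrix in ascending order):

A = np.random.uniform(-5, 5, (3, 3))
S = A @ A.T
# All eigenvalues are non-negative (up to floating-point error), so S is positive semi-definite
print(np.linalg.eigvalsh(S))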

# Number of points
N = 1000

# Dimensions
D = 3

# Number of clusters
K = 5
# Proportions
pi = np.random.dirichlet(np.ones(K))
points_in_clusters = np.array(np.round(pi * N), dtype=int)

# Adjust the last count so the totals sum to exactly N (rounding can leave it off by one)
points_in_clusters[-1] = N - sum(points_in_clusters[:-1])

# Generate means
means = np.random.uniform(-20, 20, (K, D))

# Generate a random positive semi-definite covariance matrix for each cluster
covariances = []
for k in range(K):
    A = np.random.uniform(low=-5, high=5, size=(D, D))
    covariances.append(A @ A.T)

print(f"{pi = }")
print(f"{points_in_clusters = }")
print(f"{means = }")
pi = array([0.3720035 , 0.00803737, 0.07068233, 0.03494731, 0.51432948])
points_in_clusters = array([372,   8,  71,  35, 514])
means = array([[ -4.56199674,   2.35668238, -16.62281791],
       [ -7.80293209,  -8.41854853, -12.52701084],
       [-19.41224242,  19.03505058,  16.44047548],
       [  1.37273957,  -0.48681065,  16.03255314],
       [-19.84040892,   4.4512197 ,  18.05080281]])
X_data = []
for k in range(K):
    n_points = points_in_clusters[k]
    mean = means[k]
    cov = covariances[k]
    X_data.append(np.random.multivariate_normal(mean, cov, n_points))

    
# Stack all clusters into a single (N, D) data matrix
X = np.concatenate(X_data)

X.shape
(1000, 3)

Helper Functions and EM Algorithm

Multivariate Normal Distribution
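
The function below evaluates the density in its closed form,

$$\mathcal{N}(x \mid \mu, \Sigma) = \det(2\pi\Sigma)^{-1/2} \exp\left(-\tfrac{1}{2}(x - \mu)^T \Sigma^{-1} (x - \mu)\right),$$

where $\det(2\pi\Sigma) = (2\pi)^D \det(\Sigma)$.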

def Gaussian(x, mu, cov):
    # Evaluate the multivariate normal density N(x | mu, cov) at a single point
    diff = x - mu
    ans = (np.linalg.det(2 * np.pi * cov) ** -0.5) * np.exp(-0.5 * diff @ np.linalg.inv(cov) @ diff)
    
    return ans

Initialize Clusters

  • Means initialized with a K-Means fit

  • Covariances initialized as identity matrices

  • Proportions all set equal (1 / K)

def initialize_clusters(X, n_clusters):
    km = KMeans(n_clusters).fit(X)
    means = km.cluster_centers_
    
    covariances = np.array([np.identity(X.shape[1]) for _ in range(n_clusters)])
    
    proportions = [1 / n_clusters for _ in range(n_clusters)]
    
    return proportions, means, covariances
# Keep the ground-truth parameters around for later comparison
original_means = means
original_pis = pi
original_covs = covariances
pi, means, covs = initialize_clusters(X, K)

Expectation Step
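
The E-step computes the responsibilities: the posterior probability that point $x_n$ was generated by component $k$,

$$\gamma_{nk} = \frac{\pi_k \, \mathcal{N}(x_n \mid \mu_k, \Sigma_k)}{\sum_{j=1}^{K} \pi_j \, \mathcal{N}(x_n \mid \mu_j, \Sigma_j)}$$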

def expectation_step(X, pi, means, covs):
    gamma = np.zeros((N, K), dtype=np.double)
    
    for n in range(N):
        for k in range(K):
            # Unnormalized responsibility of component k for point n
            gamma[n, k] = pi[k] * Gaussian(X[n], means[k], covs[k])

            # # Hard-code extremely small or 0 values to be 1e-300
            # if gamma[n, k] < 1e-300:
            #     gamma[n, k] = 1e-300
                
        # Normalize so each row of responsibilities sums to one
        gamma[n] /= gamma[n].sum()
    
    return gamma
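
For reference, the same step can be vectorized; this is a minimal sketch assuming SciPy is available, with scipy.stats.multivariate_normal standing in for the hand-rolled Gaussian above:

from scipy.stats import multivariate_normal

def expectation_step_vectorized(X, pi, means, covs):
    # Weighted density of every point under every component, shape (N, K)
    dens = np.stack([pi[k] * multivariate_normal.pdf(X, mean=means[k], cov=covs[k])
                     for k in range(len(pi))], axis=1)
    # Normalize each row so responsibilities sum to one
    return dens / dens.sum(axis=1, keepdims=True)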

Maximization Step
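
Given the responsibilities, the M-step re-estimates the parameters in closed form, using the effective counts $N_k = \sum_{n=1}^{N} \gamma_{nk}$:

$$\pi_k = \frac{N_k}{N}, \qquad \mu_k = \frac{1}{N_k} \sum_{n=1}^{N} \gamma_{nk} \, x_n, \qquad \Sigma_k = \frac{1}{N_k} \sum_{n=1}^{N} \gamma_{nk} \,(x_n - \mu_k)(x_n - \mu_k)^T$$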

def maximization_step(X, gamma, pi, means, covs):
    # Work on copies so the caller's parameters are not mutated in place
    optimal_pi = np.array(pi, dtype=np.double)
    optimal_means = means.copy()
    optimal_covs = covs.copy()

    N_k = gamma.sum(axis=0)
    for k in range(K):
        optimal_pi[k] = N_k[k] / N
        
        gamma_k = gamma[:, k].reshape(-1, 1)
        
        optimal_means[k] = np.sum(gamma_k * X, axis=0) / N_k[k]
        optimal_covs[k] = (gamma_k * (X - optimal_means[k])).T @ (X - optimal_means[k]) / N_k[k]
    
    return optimal_pi, optimal_means, optimal_covs

Log Likelihood
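
Convergence is monitored via the log likelihood of the data under the current parameters:

$$\ln p(X \mid \pi, \mu, \Sigma) = \sum_{n=1}^{N} \ln \sum_{k=1}^{K} \pi_k \, \mathcal{N}(x_n \mid \mu_k, \Sigma_k)$$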

def log_likelihood(X, pi, means, covs):
    temparray = []
    for n in range(N):
        # Mixture density of the current model at point n
        temp = sum([pi[k] * Gaussian(X[n], means[k], covs[k]) for k in range(K)])
        
        # # Hard-code extremely small or 0 values to 1e-300
        # if temp < 1e-300:
        #     temp = 1e-300
            
        temparray.append(np.log(temp))
    return sum(temparray)
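
The commented-out 1e-300 guard hints at the underflow this computation can hit; a more numerically stable sketch, assuming SciPy is available, works in log space with scipy.special.logsumexp:

from scipy.special import logsumexp
from scipy.stats import multivariate_normal

def log_likelihood_stable(X, pi, means, covs):
    # log p(x_n) = logsumexp_k [ log(pi_k) + log N(x_n | mu_k, Sigma_k) ]
    log_dens = np.stack([np.log(pi[k]) + multivariate_normal.logpdf(X, mean=means[k], cov=covs[k])
                         for k in range(len(pi))], axis=1)
    return logsumexp(log_dens, axis=1).sum()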
MAX_ITERS = 100
pi, means, covs = initialize_clusters(X, K)
convergence_limit = 0.01

old_likelihood = log_likelihood(X, pi, means, covs)
for i in range(MAX_ITERS):
    gamma = expectation_step(X, pi, means, covs)
    pi, means, covs = maximization_step(X, gamma, pi, means, covs)
    likelihood = log_likelihood(X, pi, means, covs)
    
    print(f"Iteration {i+1} | Log Likelihood = {likelihood}")
    
    # Stop once the improvement in log likelihood falls below the threshold;
    # old_likelihood must be refreshed each iteration, otherwise the check
    # compares the first likelihood against itself and "converges" immediately
    if abs(likelihood - old_likelihood) < convergence_limit:
        print("Converged.")
        break
    old_likelihood = likelihood
Iteration 1 | Log Likelihood = -31267.620534990634
Converged.

Results

We evaluate the clusterings with the Adjusted Rand Score/Index; refer to the Wikipedia page on the Rand index for details.
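
The adjusted score is invariant to how the cluster indices are labeled, which matters here because the fitted component numbers are arbitrary; for example:

# A perfect clustering scores 1.0 even when the labels are swapped
print(adjusted_rand_score([0, 0, 1, 1], [1, 1, 0, 0]))  # 1.0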

# GMM prediction for each point: the component with the highest responsibility
predictions = gamma.argmax(axis=1)

x, y, z = X.T

data = {
    'x': x,
    'y': y,
    'z': z,
    'pred': predictions
}

df = pd.DataFrame(data)
# Compute K-Means and GMM Adjusted Rand Score

# Ground-truth labels follow from the order in which the clusters were generated
original_classes = []
for i in range(len(X_data)):
    for point in X_data[i]:
        original_classes.append(i)
        
# GMM labels are the argmax responsibilities already computed above
predicted_classes = predictions
    
# Fit K-Means once and take its cluster assignments (the original fit the model twice)
kmeans_predicted_classes = KMeans(K).fit_predict(X)
df['kmeans_pred'] = kmeans_predicted_classes

gmm_ars = adjusted_rand_score(original_classes, predicted_classes)
kmeans_ars = adjusted_rand_score(original_classes, kmeans_predicted_classes)

print(f"GMM ARS = {gmm_ars}")
print(f"K-Means ARS = {kmeans_ars}")
GMM ARS = 0.3896964516426902
K-Means ARS = 0.38871693142431624

Visualize Data and Predictions

df['original_class'] = original_classes
px.scatter_3d(df, x='x', y='y', z='z',
              color=df['original_class'].astype(str), opacity=0.7, width=800, height=800,
              title="Original",
              color_discrete_sequence=px.colors.qualitative.D3)
px.scatter_3d(df, x='x', y='y', z='z',
              color=df['pred'].astype(str), opacity=0.7, width=800, height=800,
              title="GMM Predictions",
              color_discrete_sequence=px.colors.qualitative.D3)
px.scatter_3d(df, x='x', y='y', z='z',
              color=df['kmeans_pred'].astype(str), opacity=0.7, width=800, height=800,
              title="K-Means Predictions",
              color_discrete_sequence=px.colors.qualitative.D3)