Skip to content

Commit

Permalink
Math clarifications
Browse files Browse the repository at this point in the history
  • Loading branch information
shntnu committed Jan 9, 2024
1 parent 848cd18 commit bff48c9
Showing 1 changed file with 65 additions and 11 deletions.
76 changes: 65 additions & 11 deletions pycytominer/operations/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,10 @@ def __init__(self, epsilon=1e-6, center=True, method="ZCA", return_numpy=False):
)
self.method = method

# PCA-cor and ZCA-cor require center=True
# PCA-cor and ZCA-cor require center=True because we assumed we are
# only ever interested in computing centered Pearson correlation
# https://stackoverflow.com/questions/23891391/uncentered-pearson-correlation

if self.method in ["PCA-cor", "ZCA-cor"] and not self.center:
raise ValueError("PCA-cor and ZCA-cor require center=True")

Expand Down Expand Up @@ -103,20 +106,27 @@ def fit(self, X, y=None):
# Get the number of observations and variables
n, d = X.shape

# compute the rank of the matrix X
# Checking the rank of matrix X considering the number of samples (n) and features (d).
r = np.linalg.matrix_rank(X)

# If n < d, then rank should be equal to n - 1 (if centered) or n (if not centered)
# If n >= d, then rank should be equal to d
# Case 1: More features than samples (n < d).
# If centered (mean of each feature subtracted), one dimension becomes dependent, reducing rank to n - 1.
# If not centered, the max rank is limited by n, as there can't be more independent vectors than samples.
# Case 2: More samples than features or equal (n >= d).
# Here, the max rank is limited by d (number of features), assuming each provides unique information.

if not ((r == d) | (self.center & (r == n - 1)) | (not self.center & (r == n))):
raise ValueError(
(
"Sphering is not supported when the data matrix X is not full rank."
"The data matrix X is not full rank: n = {n}, d = {d}, r = {r}."
"Perfect linear dependencies are unusual in data matrices so something seems amiss."
"Check for linear dependencies in the data and remove them."
)
)

# Get the eigenvalues and eigenvectors of the covariance matrix using SVD
# Get the eigenvalues and eigenvectors of the covariance matrix
# by computing the SVD on the data matrix
# https://stats.stackexchange.com/q/134282/8494
_, Sigma, Vt = np.linalg.svd(X, full_matrices=True)

# if n <= d then Sigma has shape (n,) so it will need to be expanded to
Expand All @@ -130,19 +140,63 @@ def fit(self, X, y=None):
)
raise ValueError(f"{error_detail}. This is likely a bug in {context}.")

if r != n - 1:
error_detail = (
f"When n <= d, the rank should be n - 1 i.e. {n - 1} but it is {r}"
)
if (r != n - 1) & (r != n):
error_detail = f"When n <= d, the rank should be n - 1 or n i.e. {n - 1} or {n} but it is {r}"
context = (
"the call to `np.linalg.svd` in `pycytominer.transform.Spherize`"
)
raise ValueError(f"{error_detail}. This is likely a bug in {context}.")
raise ValueError(f"{error_detail}. This is likely a bug in {context}.")

Sigma = np.concatenate((Sigma[0:r], np.repeat(Sigma[r - 1], d - r)))

Sigma = Sigma + self.epsilon

# From https://arxiv.org/abs/1512.00809, the PCA whitening matrix is
#
# W = Lambda^(-1/2) E^T
#
# where
# - E is the eigenvector matrix
# - Lambda is the (diagonal) eigenvalue matrix
# of cov(X), where X is the data matrix
# (i.e., cov(X) = E Lambda E^T)
#
# However, W can also be computed using the Singular Value Decomposition
# (SVD) of X. Assuming X is mean-centered (zero mean for each feature),
# the SVD of X is given by:
#
# X = U S V^T
#
# The covariance matrix of X can be expressed using its SVD:
#
# cov(X) = (X^T X) / (n - 1)
# = (V S^2 V^T) / (n - 1)
#
# By comparing cov(X) = E * Lambda * E^T with the SVD form, we identify:
#
# E = V (eigenvectors)
# Lambda = S^2 / (n - 1) (eigenvalues)
#
# Thus, Lambda^(-1/2) can be expressed as:
#
# Lambda^(-1/2) = inv(S) * sqrt(n - 1)
#
# Therefore, the PCA Whitening matrix W becomes:
#
# W = Lambda^(-1/2) * E^T
# = (inv(S) * sqrt(n - 1)) * V^T
# = (inv(S) * V^T) * sqrt(n - 1)
#
# In NumPy, this is implemented as:
#
# W = (np.linalg.inv(S) @ Vt) * np.sqrt(n - 1)
#
# Alternatively, this can be expressed as:
#
# W = (Vt / Sigma[:, np.newaxis]) * np.sqrt(n - 1)
#
# where Sigma contains the diagonal elements of S (singular values).

self.W = (Vt / Sigma[:, np.newaxis]).transpose() * np.sqrt(n - 1)

# If ZCA, perform additional rotation
Expand Down

0 comments on commit bff48c9

Please sign in to comment.