Install all required dependencies
%pip install git+https://github.com/BonnerLab/ccn-tutorial.git
Using principal component analysis to reveal latent structure in data
August 26, 2023
March 10, 2024
Here’s a link to this notebook on Google Colab.
Let’s perform an imaginary neuroscience experiment! We’ll record voltages from P = 2 neurons in visual cortex while the participant passively views N = 1000 dots of different colors and sizes.
from collections.abc import Sequence, Callable
import warnings
from typing import NamedTuple
import numpy as np
import pandas as pd
import xarray as xr
from scipy.spatial.distance import pdist, squareform
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.figure import Figure
import matplotlib.ticker as ticker
from matplotlib.animation import FuncAnimation
from matplotlib_inline.backend_inline import set_matplotlib_formats
from IPython.display import display, HTML
%matplotlib inline
sns.set_theme(
context="notebook",
style="white",
palette="deep",
rc={"legend.edgecolor": "None"},
)
set_matplotlib_formats("svg")
pd.set_option("display.max_rows", 5)
pd.set_option("display.max_columns", 10)
pd.set_option("display.precision", 3)
pd.set_option("display.show_dimensions", False)
xr.set_options(display_max_rows=3, display_expand_data=False)
warnings.filterwarnings("ignore")
Let’s create N = 1000 dots of different colors and sizes. From the scatterplot, we can see that the two latent variables are uncorrelated.
def view_stimuli(data: pd.DataFrame) -> Figure:
fig, ax = plt.subplots()
sns.scatterplot(
ax=ax,
data=data,
x="color",
y="size",
hue="color",
size="size",
palette="flare",
legend=False,
)
sns.despine(ax=ax, trim=True)
fig.tight_layout()
plt.close(fig)
return fig
stimuli = create_stimuli(n=1_000, rng=rng)
view_stimuli(stimuli)
Now, let’s simulate some neural data. We need to decide how the P = 2 neurons might respond to these N = 1000 stimulus dots. Each neuron could respond to either one or both of the latent features that define these stimuli – \text{color} and \text{size}. The neuron’s responses could also be subject to noise.
Here, we model each neuron’s response r_\text{neuron} as a simple linear combination of the two latent features with stimulus-independent Gaussian noise \epsilon:
r_{\text{neuron}} \sim \beta_{\text{color}} \left( \text{color} \right) + \beta_{\text{size}} \left( \text{size} \right) + \epsilon, where \epsilon \sim \mathcal{N}(\mu_{\text{neuron}}, \sigma_{\text{neuron}}^2)
As we can see, each neuron’s response is completely defined by exactly four parameters:
def simulate_neuron_responses(
stimuli: pd.DataFrame,
neuron: Neuron,
*,
rng: np.random.Generator,
) -> np.ndarray:
def z_score(x: np.ndarray) -> np.ndarray:
return (x - x.mean()) / x.std()
return (
neuron.beta_color * z_score(stimuli["color"])
+ neuron.beta_size * z_score(stimuli["size"])
+ neuron.std * rng.standard_normal(size=(len(stimuli),))
+ neuron.mean
)
def simulate_multiple_neuron_responses(
*,
stimuli: pd.DataFrame,
neurons: Sequence[Neuron],
rng: np.random.Generator,
) -> xr.DataArray:
data = []
for i_neuron, neuron in enumerate(neurons):
data.append(
xr.DataArray(
data=simulate_neuron_responses(
stimuli=stimuli,
neuron=neuron,
rng=rng,
),
dims=("stimulus",),
coords={
column: ("stimulus", values)
for column, values in stimuli.reset_index(names="stimulus").items()
},
)
.expand_dims({"neuron": [i_neuron + 1]})
.assign_coords(
{
field: ("neuron", [float(value)])
for field, value in neuron._asdict().items()
}
)
)
return (
xr.concat(data, dim="neuron")
.rename("neuron responses")
.transpose("stimulus", "neuron")
)
This procedure produces a data matrix X \in \mathbb{R}^{N \times P} containing the P = 2 neurons’ responses to the N = 1000 stimuli.
<xarray.DataArray 'neuron responses' (stimulus: 1000, neuron: 2)> Size: 16kB 12.97 -15.39 1.005 3.224 1.226 2.431 ... 6.028 -6.32 5.488 -16.32 6.215 -9.989 Coordinates: (3/8) * neuron (neuron) int64 16B 1 2 * stimulus (stimulus) int64 8kB 1 2 3 4 5 6 7 ... 995 996 997 998 999 1000 ... ... std (neuron) float64 16B 1.0 3.0
array([[ 12.96533678, -15.38921968], [ 1.00523852, 3.22448895], [ 1.2258057 , 2.43055775], ..., [ 6.02810356, -6.32031526], [ 5.48782478, -16.32351293], [ 6.21490603, -9.98853112]])
array([1, 2])
array([ 1, 2, 3, ..., 998, 999, 1000])
array([6.36961687e-01, 2.69786714e-01, 4.09735239e-02, 1.65276355e-02, 8.13270239e-01, 9.12755577e-01, 6.06635776e-01, 7.29496561e-01, 5.43624991e-01, 9.35072424e-01, 8.15853554e-01, 2.73850017e-03, 8.57404277e-01, 3.35855753e-02, 7.29655446e-01, 1.75655621e-01, 8.63178922e-01, 5.41461220e-01, 2.99711891e-01, 4.22687221e-01, 2.83196711e-02, 1.24283276e-01, 6.70624415e-01, 6.47189512e-01, 6.15385111e-01, 3.83677554e-01, 9.97209936e-01, 9.80835339e-01, 6.85541984e-01, 6.50459276e-01, 6.88446731e-01, 3.88921424e-01, 1.35096505e-01, 7.21488340e-01, 5.25354322e-01, 3.10241876e-01, 4.85835359e-01, 8.89487834e-01, 9.34043516e-01, 3.57795197e-01, 5.71529831e-01, 3.21869391e-01, 5.94300030e-01, 3.37911226e-01, 3.91619001e-01, 8.90274352e-01, 2.27157594e-01, 6.23187145e-01, 8.40153436e-02, 8.32644148e-01, 7.87098307e-01, 2.39369443e-01, 8.76484231e-01, 5.85680348e-02, 3.36117061e-01, 1.50279467e-01, 4.50339367e-01, 7.96324270e-01, 2.30642209e-01, 5.20213011e-02, 4.04551840e-01, 1.98513045e-01, 9.07530456e-02, 5.80332386e-01, 2.98696133e-01, 6.71994878e-01, 1.99515444e-01, 9.42113111e-01, 3.65110168e-01, 1.05495280e-01, 6.29108152e-01, 9.27154553e-01, 4.40377155e-01, 9.54590494e-01, 4.99895814e-01, 4.25228625e-01, 6.20213452e-01, 9.95096505e-01, 9.48943675e-01, 4.60045139e-01, ... 3.39811155e-01, 2.21699710e-04, 4.82537622e-01, 6.08000665e-01, 9.29904601e-02, 2.42094402e-01, 8.03991821e-01, 8.40281560e-01, 3.87733254e-01, 8.14223731e-01, 2.77140253e-01, 7.06108222e-01, 5.45456624e-01, 4.40099079e-01, 6.56442276e-01, 1.33906791e-02, 1.62443443e-01, 2.93823464e-01, 6.80562610e-01, 7.06235313e-01, 6.80760824e-01, 7.67617107e-01, 7.95515610e-02, 1.05888908e-01, 8.55351160e-01, 3.56837753e-01, 5.68371290e-01, 5.03502806e-01, 6.26662571e-01, 7.69467112e-02, 7.69790226e-01, 1.23402328e-01, 6.81374462e-01, 4.02142099e-01, 4.92261636e-01, 6.71693734e-01, 3.71002751e-01, 4.60374714e-02, 9.64211552e-01, 5.22676899e-01, 7.42144641e-01, 5.31294834e-01, 8.19686903e-01, 5.64616179e-01, 1.22756878e-01, 6.41906565e-01, 1.72740669e-01, 8.23654135e-01, 6.81061600e-01, 9.39808638e-01, 6.29080790e-01, 2.25163097e-01, 5.57137546e-01, 7.71772277e-01, 7.11888298e-01, 3.42296706e-01, 6.55351151e-01, 9.35269061e-01, 6.84810043e-01, 3.67301402e-01, 9.10758330e-01, 8.27624193e-01, 8.55183760e-01, 1.06841377e-01, 2.90828834e-01, 7.90127803e-01, 2.74807496e-01, 7.37059356e-02, 6.83266012e-01, 7.99269956e-01, 6.41767808e-01, 3.44843336e-01, 5.59773319e-01, 2.15199514e-02, 5.62661656e-01, 8.56801175e-01, 7.80532350e-02, 3.83319397e-01, 1.64863718e-01, 3.80007897e-01])
array([0.01300767, 0.82776292, 0.49624331, 0.43591807, 0.60179472, 0.8500282 , 0.29126072, 0.26751697, 0.04949421, 0.26639909, 0.06621185, 0.04155849, 0.55273056, 0.18383489, 0.07425772, 0.91671474, 0.14873389, 0.09483079, 0.97067806, 0.66696696, 0.72575404, 0.563204 , 0.07038971, 0.84187724, 0.418029 , 0.39246782, 0.13530924, 0.11321989, 0.52224593, 0.56874359, 0.51868554, 0.61312468, 0.87764346, 0.50420484, 0.37914768, 0.2565727 , 0.30684664, 0.56080705, 0.79537234, 0.44112103, 0.04076233, 0.1881545 , 0.09065223, 0.33334341, 0.68437666, 0.59071473, 0.66212759, 0.45459513, 0.10978054, 0.29625596, 0.51096048, 0.49716497, 0.24366139, 0.82530156, 0.43331334, 0.84545613, 0.26549263, 0.94193959, 0.11185735, 0.76918249, 0.02018645, 0.23632066, 0.8705533 , 0.350105 , 0.93247949, 0.929417 , 0.80019258, 0.39610545, 0.8582685 , 0.45710434, 0.1261722 , 0.85195837, 0.8162467 , 0.13556544, 0.86652652, 0.51896305, 0.74359076, 0.26817602, 0.21546148, 0.84831281, 0.6002138 , 0.14770547, 0.36587009, 0.85903582, 0.46828358, 0.3368529 , 0.34095386, 0.8246442 , 0.45429903, 0.9483535 , 0.31220015, 0.75648033, 0.28570549, 0.7678388 , 0.01759798, 0.12982098, 0.25925691, 0.87009196, 0.32249838, 0.48352554, ... 0.25857775, 0.09677339, 0.18039325, 0.25448073, 0.83934999, 0.22122572, 0.82839782, 0.74329961, 0.97429452, 0.75359014, 0.11511305, 0.94004275, 0.84209462, 0.4436286 , 0.43358894, 0.03095428, 0.21764154, 0.71488612, 0.1105268 , 0.99174489, 0.02168247, 0.99097346, 0.29620757, 0.46086521, 0.54547145, 0.28955319, 0.22049757, 0.0505692 , 0.823433 , 0.97230937, 0.12555748, 0.85983084, 0.72053945, 0.75022534, 0.38780546, 0.05757491, 0.69042312, 0.96212643, 0.65875825, 0.23405231, 0.53673526, 0.12167514, 0.71787855, 0.67286419, 0.446794 , 0.00866449, 0.06036607, 0.68197769, 0.69241334, 0.57520933, 0.22769122, 0.66313159, 0.1055052 , 0.62768057, 0.57231791, 0.35549364, 0.21916604, 0.7245827 , 0.18099196, 0.15735466, 0.63276412, 0.66062453, 0.1019691 , 0.26228831, 0.09859603, 0.91383146, 0.00838652, 0.35101158, 0.15682101, 0.46699488, 0.90683736, 0.70737638, 0.36017375, 0.1866537 , 0.70501213, 0.54191587, 0.72028997, 0.04485529, 0.17314062, 0.3198733 , 0.4665367 , 0.56944602, 0.56209399, 0.54291566, 0.56611634, 0.41747562, 0.27881285, 0.51812954, 0.12186051, 0.74927137, 0.95293572, 0.05409649, 0.78232714, 0.22383289, 0.33641135, 0.03353818, 0.9690858 , 0.56209561, 0.08158143, 0.32155563])
array([ 3., -2.])
array([-2., 5.])
array([ 7., -6.])
array([1., 3.])
PandasIndex(Index([1, 2], dtype='int64', name='neuron'))
PandasIndex(Index([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ... 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000], dtype='int64', name='stimulus', length=1000))
In this toy example, we know the ground truth of the neurons’ responses – how exactly it depends on the stimulus features. Unfortunately, in real experiments, we don’t have this luxury and need to derive the neural coding strategy from the data. Let’s now pretend that this toy example is real data, where we don’t know the ground truth.
How is information about the stimulus encoded in the population activity? Is there a neuron that is sensitive to color and another that is sensitive to size?
One way to understand this is by studying the neurons directly. Let’s start by visualizing the response of each neuron to our stimuli.
def view_individual_scatter(
data: xr.DataArray,
*,
coord: str,
dim: str,
template_func: Callable[[int], str],
) -> Figure:
rng = np.random.default_rng()
data_ = data.assign_coords(
{"arbitrary": ("stimulus", rng.random(data.sizes["stimulus"]))}
)
min_, max_ = data_.min(), data_.max()
n_features = data.sizes[dim]
fig, axes = plt.subplots(nrows=n_features, figsize=(7, 2 * n_features))
for index, ax in zip(data[coord].values, axes.flat):
label = template_func(index)
sns.scatterplot(
ax=ax,
data=(
data_.isel({dim: data[coord].values == index})
.rename(label)
.to_dataframe()
),
x=label,
y="arbitrary",
hue="color",
size="size",
palette="flare",
legend=False,
)
sns.despine(ax=ax, left=True, offset=10)
ax.set_xlim([min_, max_])
ax.get_yaxis().set_visible(False)
fig.tight_layout(h_pad=3)
plt.close(fig)
return fig
By visualizing the responses, we can see that each neuron is tuned to both color and size.
We can also use methods such as Representational Similarity Analysis (Kriegeskorte 2008) to study the information content of the two neurons. In RSA, we compute the pairwise dissimilarities between the representations of the stimuli and organize them into a representational dissimilarity matrix, or RDM.
The entries of an RDM indicate the degree to which each pair of stimuli is distinguished by the neurons and the RDM as a whole measures the overall population geometry of the system.
def display_rdm(data: xr.DataArray) -> np.ndarray:
data_ = data.copy()
data_["color bin"] = (
"stimulus",
np.digitize(data_["color"].values, bins=np.linspace(0, 1, 10)),
)
data_["size bin"] = (
"stimulus",
np.digitize(data_["size"].values, bins=np.linspace(0, 1, 10)),
)
data_ = data_.sortby(["color bin", "size bin"])
rdm = squareform(pdist(data_.values))
fig, axes = plt.subplots(
figsize=(8, 8),
nrows=2,
ncols=2,
height_ratios=[1, 20],
width_ratios=[1, 20],
sharex=True,
sharey=True,
)
axes[0, 1].scatter(
np.arange(data_.sizes["stimulus"]),
np.zeros(data_.sizes["stimulus"]),
c=data_["color"].values,
s=data_["size"].values * 100,
)
axes[0, 1].axis("off")
axes[1, 0].scatter(
np.zeros(data_.sizes["stimulus"]),
np.arange(data_.sizes["stimulus"]),
c=data_["color"].values,
s=data_["size"].values * 100,
)
axes[1, 1].imshow(rdm, cmap="viridis")
axes[1, 1].set_aspect("equal", "box")
for ax in axes.flat:
ax.axis("off")
fig.tight_layout(w_pad=0, h_pad=0)
plt.close(fig)
return fig
display_rdm(data)
This RDM reveals that this population of P = 2 neurons distinguishes the stimulus dots based on their color and size – as expected!
What if we want to study the underlying structure of this code? Is there another view of the population code that might be more informative?
Instead of directly studying the neurons, we can focus on the underlying factors that capture the structure and variance in the data.
Since we only have P = 2 neurons, we can visualize these data as a scatterplot, which makes their covariance apparent.
def view_joint_scatter(
data: xr.DataArray,
*,
coord: str,
dim: str,
template_func: Callable[[int], str],
draw_axes: bool = False,
) -> Figure:
fig, ax = plt.subplots()
data_ = pd.DataFrame(
{coord_: data[coord_].values for coord_ in ("color", "size")}
| {
template_func(index): data.isel({dim: index - 1}).to_dataframe()[coord]
for index in (1, 2)
}
)
sns.scatterplot(
ax=ax,
data=data_,
x=template_func(1),
y=template_func(2),
hue="color",
size="size",
legend=False,
palette="flare",
)
if draw_axes:
ax.axhline(0, c="gray", ls="--")
ax.axvline(0, c="gray", ls="--")
ax.set_aspect("equal", "box")
sns.despine(ax=ax, offset=20)
plt.close(fig)
return fig
view_joint_scatter(
data,
coord="neuron responses",
dim="neuron",
template_func=lambda x: f"neuron {x} response",
)
We can use the covariance to change the way we view the data.
Click on the animation above to visualize the PCA transformation!
def animate_pca_transformation(
data: xr.DataArray,
*,
durations: dict[str, int] = {
"center": 1_000,
"rotate": 1_000,
"pause": 500,
},
interval: int = 50,
) -> str:
def _compute_2d_rotation_matrix(theta: float) -> np.ndarray:
return np.array(
[
[np.cos(theta), -np.sin(theta)],
[np.sin(theta), np.cos(theta)],
]
)
fig = view_joint_scatter(
data,
coord="neuron responses",
dim="neuron",
template_func=lambda x: f"neuron {x} response",
draw_axes=True,
)
ax = fig.get_axes()[0]
scatter = ax.get_children()[0]
title = fig.suptitle("neuron responses")
n_frames = {key: value // interval + 1 for key, value in durations.items()}
x_mean, y_mean = data.mean("stimulus").values
delta = np.array([x_mean, y_mean]) / n_frames["center"]
_, _, v_h = np.linalg.svd(data - data.mean("stimulus"))
v = v_h.transpose()
theta = np.arccos(v[0, 0])
rotation = _compute_2d_rotation_matrix(-theta / n_frames["rotate"])
transformed = (data - data.mean("stimulus")).values @ v
radius = max(np.linalg.norm(transformed, axis=-1))
limit = max(np.abs(data).max(), np.abs(transformed).max(), radius)
ax.set_xlim([-limit, limit])
ax.set_ylim([-limit, limit])
fig.tight_layout()
frame_to_retitle_center = 2 * n_frames["pause"]
frame_to_start_centering = frame_to_retitle_center + n_frames["pause"]
frame_to_stop_centering = frame_to_start_centering + n_frames["center"]
frame_to_retitle_rotate = frame_to_stop_centering + n_frames["pause"]
frame_to_start_rotating = frame_to_retitle_rotate + n_frames["pause"]
frame_to_stop_rotating = frame_to_start_rotating + n_frames["rotate"]
frame_to_retitle_transformed = frame_to_stop_rotating + n_frames["pause"]
frame_to_end = frame_to_retitle_transformed + 2 * n_frames["pause"]
def _update(frame: int) -> None:
if frame < frame_to_retitle_center:
return
elif frame == frame_to_retitle_center:
title.set_text("step 1 of 2: center the data")
ax.set_xlabel("")
ax.set_ylabel("")
elif frame < frame_to_start_centering:
return
elif frame <= frame_to_stop_centering:
scatter.set_offsets(scatter.get_offsets() - delta)
elif frame == frame_to_retitle_rotate:
title.set_text("step 2 of 2: rotate the data")
elif frame < frame_to_start_rotating:
return
elif frame <= frame_to_stop_rotating:
scatter.set_offsets(scatter.get_offsets().data @ rotation)
elif frame < frame_to_retitle_transformed:
return
elif frame == frame_to_retitle_transformed:
title.set_text("principal components")
ax.set_xlabel("principal component 1")
ax.set_ylabel("principal component 2")
elif frame <= frame_to_end:
return
animation = FuncAnimation(
fig=fig,
func=_update,
frames=frame_to_end,
interval=interval,
repeat=False,
)
plt.close(fig)
return animation.to_html5_video()
display(HTML(animate_pca_transformation(data)))
As seen in the animation, we can transform our data to view the directions of maximum variance. These directions are the principal components of our data.
Given a data matrix X \in \mathbb{R}^{N \times P}, we need to compute the eigendecomposition1 of its covariance2:
\begin{align*} \text{cov}(X) &= \left(\dfrac{1}{n - 1}\right) (X - \overline{X})^\top (X - \overline{X})\\ &= V \Lambda V^\top \end{align*}
To do this, we start by computing the covariance of our data matrix, where X is centered (i.e. X - \overline{X})
Next, we compute the eigendecomposition of the covariance:
The columns of V are eigenvectors that specify the directions of variance while the corresponding diagonal elements of \Lambda are eigenvalues that specify the amount of variance along the eigenvector3.
Finally, the original data matrix can be transformed by projecting it onto the eigenvectors: \widetilde{X} = \left(X - \overline{X}\right) V.
class PCA:
def __init__(self) -> None:
self.mean: np.ndarray
self.eigenvectors: np.ndarray
self.eigenvalues: np.ndarray
def fit(self, /, data: np.ndarray) -> None:
self.mean = data.mean(axis=-2)
1 data_centered = data - self.mean
2 _, s, v_t = np.linalg.svd(data_centered)
n_stimuli = data.shape[-2]
3 self.eigenvectors = np.swapaxes(v_t, -1, -2)
4 self.eigenvalues = s**2 / (n_stimuli - 1)
def transform(self, /, data: np.ndarray) -> np.ndarray:
5 return (data - self.mean) @ self.eigenvectors
To apply PCA to a data matrix, we might be tempted to use the definition and naively compute its covariance followed by an eigendecomposition. However, when the number of neurons P is large, this approach is memory-intensive and prone to numerical errors.
Instead, we can use the singular value decomposition (SVD) of X to efficiently compute its PCA transformation. Specifically, X = U \Sigma V^\top is a singular value decomposition, where U and V are orthonormal and \Sigma is diagonal.
The covariance matrix reduces to X^\top X / (n - 1) = V \left(\frac{\Sigma^2}{n - 1} \right) V^\top, which is exactly the eigendecomposition required.
Specifically, the eigenvalues \lambda_i of the covariance matrix are related to the singular values \sigma_i of the data matrix as \lambda_i = \sigma_i^2 / (N - 1), while the eigenvectors of the covariance matrix are exactly the right singular vectors V of the data matrix X.
Check out truncated SVD!
Let’s now project our data onto its principal components and analyze it in this space.
def compute_pca(data: xr.DataArray) -> xr.Dataset:
pca = PCA()
pca.fit(data.values)
data_transformed = pca.transform(data.values)
return xr.Dataset(
data_vars={
"score": xr.DataArray(
data=data_transformed,
dims=("stimulus", "component"),
),
"eigenvector": xr.DataArray(
data=pca.eigenvectors,
dims=("component", "neuron"),
),
},
coords={
"rank": ("component", 1 + np.arange(data_transformed.shape[-1])),
"eigenvalue": ("component", pca.eigenvalues),
}
| {coord: (data[coord].dims[0], data[coord].values) for coord in data.coords},
)
pca = compute_pca(data)
display(pca["score"])
<xarray.DataArray 'score' (stimulus: 1000, component: 2)> Size: 16kB 11.19 1.269 -10.77 -1.394 -9.962 -1.541 ... -1.028 8.777 -5.869 3.39 -2.457 Coordinates: (3/5) * stimulus (stimulus) int64 8kB 1 2 3 4 5 6 7 ... 995 996 997 998 999 1000 rank (component) int64 16B 1 2 ... ... size (stimulus) float64 8kB 0.01301 0.8278 0.4962 ... 0.08158 0.3216 Dimensions without coordinates: component
array([[ 1.11911792e+01, 1.26889981e+00], [-1.07729551e+01, -1.39397546e+00], [-9.96219137e+00, -1.54107707e+00], ..., [ 6.58127224e-03, -1.02783716e+00], [ 8.77667289e+00, -5.86946476e+00], [ 3.39020249e+00, -2.45675640e+00]])
array([ 1, 2, 3, ..., 998, 999, 1000])
array([1, 2])
array([46.29579973, 6.14792569])
array([6.36961687e-01, 2.69786714e-01, 4.09735239e-02, 1.65276355e-02, 8.13270239e-01, 9.12755577e-01, 6.06635776e-01, 7.29496561e-01, 5.43624991e-01, 9.35072424e-01, 8.15853554e-01, 2.73850017e-03, 8.57404277e-01, 3.35855753e-02, 7.29655446e-01, 1.75655621e-01, 8.63178922e-01, 5.41461220e-01, 2.99711891e-01, 4.22687221e-01, 2.83196711e-02, 1.24283276e-01, 6.70624415e-01, 6.47189512e-01, 6.15385111e-01, 3.83677554e-01, 9.97209936e-01, 9.80835339e-01, 6.85541984e-01, 6.50459276e-01, 6.88446731e-01, 3.88921424e-01, 1.35096505e-01, 7.21488340e-01, 5.25354322e-01, 3.10241876e-01, 4.85835359e-01, 8.89487834e-01, 9.34043516e-01, 3.57795197e-01, 5.71529831e-01, 3.21869391e-01, 5.94300030e-01, 3.37911226e-01, 3.91619001e-01, 8.90274352e-01, 2.27157594e-01, 6.23187145e-01, 8.40153436e-02, 8.32644148e-01, 7.87098307e-01, 2.39369443e-01, 8.76484231e-01, 5.85680348e-02, 3.36117061e-01, 1.50279467e-01, 4.50339367e-01, 7.96324270e-01, 2.30642209e-01, 5.20213011e-02, 4.04551840e-01, 1.98513045e-01, 9.07530456e-02, 5.80332386e-01, 2.98696133e-01, 6.71994878e-01, 1.99515444e-01, 9.42113111e-01, 3.65110168e-01, 1.05495280e-01, 6.29108152e-01, 9.27154553e-01, 4.40377155e-01, 9.54590494e-01, 4.99895814e-01, 4.25228625e-01, 6.20213452e-01, 9.95096505e-01, 9.48943675e-01, 4.60045139e-01, ... 3.39811155e-01, 2.21699710e-04, 4.82537622e-01, 6.08000665e-01, 9.29904601e-02, 2.42094402e-01, 8.03991821e-01, 8.40281560e-01, 3.87733254e-01, 8.14223731e-01, 2.77140253e-01, 7.06108222e-01, 5.45456624e-01, 4.40099079e-01, 6.56442276e-01, 1.33906791e-02, 1.62443443e-01, 2.93823464e-01, 6.80562610e-01, 7.06235313e-01, 6.80760824e-01, 7.67617107e-01, 7.95515610e-02, 1.05888908e-01, 8.55351160e-01, 3.56837753e-01, 5.68371290e-01, 5.03502806e-01, 6.26662571e-01, 7.69467112e-02, 7.69790226e-01, 1.23402328e-01, 6.81374462e-01, 4.02142099e-01, 4.92261636e-01, 6.71693734e-01, 3.71002751e-01, 4.60374714e-02, 9.64211552e-01, 5.22676899e-01, 7.42144641e-01, 5.31294834e-01, 8.19686903e-01, 5.64616179e-01, 1.22756878e-01, 6.41906565e-01, 1.72740669e-01, 8.23654135e-01, 6.81061600e-01, 9.39808638e-01, 6.29080790e-01, 2.25163097e-01, 5.57137546e-01, 7.71772277e-01, 7.11888298e-01, 3.42296706e-01, 6.55351151e-01, 9.35269061e-01, 6.84810043e-01, 3.67301402e-01, 9.10758330e-01, 8.27624193e-01, 8.55183760e-01, 1.06841377e-01, 2.90828834e-01, 7.90127803e-01, 2.74807496e-01, 7.37059356e-02, 6.83266012e-01, 7.99269956e-01, 6.41767808e-01, 3.44843336e-01, 5.59773319e-01, 2.15199514e-02, 5.62661656e-01, 8.56801175e-01, 7.80532350e-02, 3.83319397e-01, 1.64863718e-01, 3.80007897e-01])
array([0.01300767, 0.82776292, 0.49624331, 0.43591807, 0.60179472, 0.8500282 , 0.29126072, 0.26751697, 0.04949421, 0.26639909, 0.06621185, 0.04155849, 0.55273056, 0.18383489, 0.07425772, 0.91671474, 0.14873389, 0.09483079, 0.97067806, 0.66696696, 0.72575404, 0.563204 , 0.07038971, 0.84187724, 0.418029 , 0.39246782, 0.13530924, 0.11321989, 0.52224593, 0.56874359, 0.51868554, 0.61312468, 0.87764346, 0.50420484, 0.37914768, 0.2565727 , 0.30684664, 0.56080705, 0.79537234, 0.44112103, 0.04076233, 0.1881545 , 0.09065223, 0.33334341, 0.68437666, 0.59071473, 0.66212759, 0.45459513, 0.10978054, 0.29625596, 0.51096048, 0.49716497, 0.24366139, 0.82530156, 0.43331334, 0.84545613, 0.26549263, 0.94193959, 0.11185735, 0.76918249, 0.02018645, 0.23632066, 0.8705533 , 0.350105 , 0.93247949, 0.929417 , 0.80019258, 0.39610545, 0.8582685 , 0.45710434, 0.1261722 , 0.85195837, 0.8162467 , 0.13556544, 0.86652652, 0.51896305, 0.74359076, 0.26817602, 0.21546148, 0.84831281, 0.6002138 , 0.14770547, 0.36587009, 0.85903582, 0.46828358, 0.3368529 , 0.34095386, 0.8246442 , 0.45429903, 0.9483535 , 0.31220015, 0.75648033, 0.28570549, 0.7678388 , 0.01759798, 0.12982098, 0.25925691, 0.87009196, 0.32249838, 0.48352554, ... 0.25857775, 0.09677339, 0.18039325, 0.25448073, 0.83934999, 0.22122572, 0.82839782, 0.74329961, 0.97429452, 0.75359014, 0.11511305, 0.94004275, 0.84209462, 0.4436286 , 0.43358894, 0.03095428, 0.21764154, 0.71488612, 0.1105268 , 0.99174489, 0.02168247, 0.99097346, 0.29620757, 0.46086521, 0.54547145, 0.28955319, 0.22049757, 0.0505692 , 0.823433 , 0.97230937, 0.12555748, 0.85983084, 0.72053945, 0.75022534, 0.38780546, 0.05757491, 0.69042312, 0.96212643, 0.65875825, 0.23405231, 0.53673526, 0.12167514, 0.71787855, 0.67286419, 0.446794 , 0.00866449, 0.06036607, 0.68197769, 0.69241334, 0.57520933, 0.22769122, 0.66313159, 0.1055052 , 0.62768057, 0.57231791, 0.35549364, 0.21916604, 0.7245827 , 0.18099196, 0.15735466, 0.63276412, 0.66062453, 0.1019691 , 0.26228831, 0.09859603, 0.91383146, 0.00838652, 0.35101158, 0.15682101, 0.46699488, 0.90683736, 0.70737638, 0.36017375, 0.1866537 , 0.70501213, 0.54191587, 0.72028997, 0.04485529, 0.17314062, 0.3198733 , 0.4665367 , 0.56944602, 0.56209399, 0.54291566, 0.56611634, 0.41747562, 0.27881285, 0.51812954, 0.12186051, 0.74927137, 0.95293572, 0.05409649, 0.78232714, 0.22383289, 0.33641135, 0.03353818, 0.9690858 , 0.56209561, 0.08158143, 0.32155563])
PandasIndex(Index([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ... 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000], dtype='int64', name='stimulus', length=1000))
The eigenvectors represent the directions of variance in our data, sorted in descending order of variance.
with xr.set_options(display_expand_data=True):
display(pca["eigenvector"])
fig = view_joint_scatter(
data,
coord="neuron responses",
dim="neuron",
template_func=lambda x: f"neuron {x} response",
)
ax = fig.get_axes()[0]
ax.axline(
data.mean("stimulus").values,
slope=-pca["eigenvector"].values[0, 1] / pca["eigenvector"].values[0, 0],
c="k",
lw=3,
label="PC 1",
)
ax.axline(
data.mean("stimulus").values,
slope=-pca["eigenvector"].values[1, 1] / pca["eigenvector"].values[1, 0],
c="gray",
lw=2,
label="PC 2",
)
ax.legend()
display(fig)
<xarray.DataArray 'eigenvector' (component: 2, neuron: 2)> Size: 32B array([[ 0.43538525, 0.90024423], [-0.90024423, 0.43538525]]) Coordinates: (3/7) * neuron (neuron) int64 16B 1 2 rank (component) int64 16B 1 2 ... ... std (neuron) float64 16B 1.0 3.0 Dimensions without coordinates: component
array([[ 0.43538525, 0.90024423], [-0.90024423, 0.43538525]])
array([1, 2])
array([1, 2])
array([46.29579973, 6.14792569])
array([ 3., -2.])
array([-2., 5.])
array([ 7., -6.])
array([1., 3.])
PandasIndex(Index([1, 2], dtype='int64', name='neuron'))
We can view the data projected onto each of the principal components.
We observe that:
Note that these components do not directly correspond to either of the latent variables. Rather, each is a mixture of stimulus-dependent signal and noise.
The eigenvalues show the variance along each eigenvector. The total variance along all the eigenvectors is the sum of the eigenvalues.
variance along eigenvector 1 (eigenvalue 1): 46.296
variance along eigenvector 2 (eigenvalue 2): 6.148
total variance: 52.444
We can also see that this is equal to the total variance in the original data, since the PCA transformation corresponds to a simple rotation.
variance of neuron 1 responses: 13.758
variance of neuron 2 responses: 38.685
total variance: 52.443
We can plot the eigenvalues as a function of their rank to visualize the eigenspectrum. As we will see shortly, the eigenspectrum provides valuable insights about the latent dimensionality of the data.
def view_eigenspectrum(pca: xr.DataArray) -> Figure:
fig, ax = plt.subplots(figsize=(pca.sizes["component"], 5))
sns.lineplot(
ax=ax,
data=pca["component"].to_dataframe(),
x="rank",
y="eigenvalue",
marker="s",
)
ax.set_xticks(pca["rank"].values)
ax.set_ylim(bottom=0)
sns.despine(ax=ax, offset=20)
plt.close(fig)
return fig
view_eigenspectrum(pca)
In these data, the dimensionality is clear: there are two latent variables and both are evident in the principal components. However, in real data, we typically record from more than P = 2 neurons; therefore, judging the dimensionality becomes tricky. To simulate such a scenario, let’s record from more neurons (say P = 10).
def _simulate_random_neuron(rng: np.random.Generator) -> Neuron:
return Neuron(
beta_color=rng.integers(-10, 11),
beta_size=rng.integers(-10, 11),
std=rng.integers(-10, 11),
mean=rng.integers(-10, 11),
)
neurons = tuple([_simulate_random_neuron(rng) for _ in range(10)])
big_data = simulate_multiple_neuron_responses(
stimuli=stimuli,
neurons=neurons,
rng=rng,
)
display(big_data)
<xarray.DataArray 'neuron responses' (stimulus: 1000, neuron: 10)> Size: 80kB -1.747 -8.53 -7.006 4.696 8.766 28.16 ... 8.782 0.7017 -2.132 -1.981 -1.87 Coordinates: (3/8) * neuron (neuron) int64 80B 1 2 3 4 5 6 7 8 9 10 * stimulus (stimulus) int64 8kB 1 2 3 4 5 6 7 ... 995 996 997 998 999 1000 ... ... std (neuron) float64 80B -5.0 2.0 -3.0 5.0 ... -2.0 -4.0 -2.0 -8.0
array([[ -1.7468685 , -8.52988047, -7.00619911, ..., -12.08407229, -6.11588709, 5.79962078], [ 0.5417999 , 5.76857817, -14.19056082, ..., -7.57254665, 15.63044955, -12.08460524], [-15.14390287, 7.43798007, -11.87857537, ..., -5.60388679, 1.55649921, -0.7286054 ], ..., [-11.867323 , 3.04997894, -8.19304571, ..., -13.63460347, 7.49747562, 10.62648332], [-23.71380179, 4.65415007, -5.80502762, ..., -7.50271373, -10.13915485, -8.14599889], [ -7.25947058, 1.02221347, -8.99930611, ..., -2.13177182, -1.98124877, -1.87010738]])
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
array([ 1, 2, 3, ..., 998, 999, 1000])
array([6.36961687e-01, 2.69786714e-01, 4.09735239e-02, 1.65276355e-02, 8.13270239e-01, 9.12755577e-01, 6.06635776e-01, 7.29496561e-01, 5.43624991e-01, 9.35072424e-01, 8.15853554e-01, 2.73850017e-03, 8.57404277e-01, 3.35855753e-02, 7.29655446e-01, 1.75655621e-01, 8.63178922e-01, 5.41461220e-01, 2.99711891e-01, 4.22687221e-01, 2.83196711e-02, 1.24283276e-01, 6.70624415e-01, 6.47189512e-01, 6.15385111e-01, 3.83677554e-01, 9.97209936e-01, 9.80835339e-01, 6.85541984e-01, 6.50459276e-01, 6.88446731e-01, 3.88921424e-01, 1.35096505e-01, 7.21488340e-01, 5.25354322e-01, 3.10241876e-01, 4.85835359e-01, 8.89487834e-01, 9.34043516e-01, 3.57795197e-01, 5.71529831e-01, 3.21869391e-01, 5.94300030e-01, 3.37911226e-01, 3.91619001e-01, 8.90274352e-01, 2.27157594e-01, 6.23187145e-01, 8.40153436e-02, 8.32644148e-01, 7.87098307e-01, 2.39369443e-01, 8.76484231e-01, 5.85680348e-02, 3.36117061e-01, 1.50279467e-01, 4.50339367e-01, 7.96324270e-01, 2.30642209e-01, 5.20213011e-02, 4.04551840e-01, 1.98513045e-01, 9.07530456e-02, 5.80332386e-01, 2.98696133e-01, 6.71994878e-01, 1.99515444e-01, 9.42113111e-01, 3.65110168e-01, 1.05495280e-01, 6.29108152e-01, 9.27154553e-01, 4.40377155e-01, 9.54590494e-01, 4.99895814e-01, 4.25228625e-01, 6.20213452e-01, 9.95096505e-01, 9.48943675e-01, 4.60045139e-01, ... 3.39811155e-01, 2.21699710e-04, 4.82537622e-01, 6.08000665e-01, 9.29904601e-02, 2.42094402e-01, 8.03991821e-01, 8.40281560e-01, 3.87733254e-01, 8.14223731e-01, 2.77140253e-01, 7.06108222e-01, 5.45456624e-01, 4.40099079e-01, 6.56442276e-01, 1.33906791e-02, 1.62443443e-01, 2.93823464e-01, 6.80562610e-01, 7.06235313e-01, 6.80760824e-01, 7.67617107e-01, 7.95515610e-02, 1.05888908e-01, 8.55351160e-01, 3.56837753e-01, 5.68371290e-01, 5.03502806e-01, 6.26662571e-01, 7.69467112e-02, 7.69790226e-01, 1.23402328e-01, 6.81374462e-01, 4.02142099e-01, 4.92261636e-01, 6.71693734e-01, 3.71002751e-01, 4.60374714e-02, 9.64211552e-01, 5.22676899e-01, 7.42144641e-01, 5.31294834e-01, 8.19686903e-01, 5.64616179e-01, 1.22756878e-01, 6.41906565e-01, 1.72740669e-01, 8.23654135e-01, 6.81061600e-01, 9.39808638e-01, 6.29080790e-01, 2.25163097e-01, 5.57137546e-01, 7.71772277e-01, 7.11888298e-01, 3.42296706e-01, 6.55351151e-01, 9.35269061e-01, 6.84810043e-01, 3.67301402e-01, 9.10758330e-01, 8.27624193e-01, 8.55183760e-01, 1.06841377e-01, 2.90828834e-01, 7.90127803e-01, 2.74807496e-01, 7.37059356e-02, 6.83266012e-01, 7.99269956e-01, 6.41767808e-01, 3.44843336e-01, 5.59773319e-01, 2.15199514e-02, 5.62661656e-01, 8.56801175e-01, 7.80532350e-02, 3.83319397e-01, 1.64863718e-01, 3.80007897e-01])
array([0.01300767, 0.82776292, 0.49624331, 0.43591807, 0.60179472, 0.8500282 , 0.29126072, 0.26751697, 0.04949421, 0.26639909, 0.06621185, 0.04155849, 0.55273056, 0.18383489, 0.07425772, 0.91671474, 0.14873389, 0.09483079, 0.97067806, 0.66696696, 0.72575404, 0.563204 , 0.07038971, 0.84187724, 0.418029 , 0.39246782, 0.13530924, 0.11321989, 0.52224593, 0.56874359, 0.51868554, 0.61312468, 0.87764346, 0.50420484, 0.37914768, 0.2565727 , 0.30684664, 0.56080705, 0.79537234, 0.44112103, 0.04076233, 0.1881545 , 0.09065223, 0.33334341, 0.68437666, 0.59071473, 0.66212759, 0.45459513, 0.10978054, 0.29625596, 0.51096048, 0.49716497, 0.24366139, 0.82530156, 0.43331334, 0.84545613, 0.26549263, 0.94193959, 0.11185735, 0.76918249, 0.02018645, 0.23632066, 0.8705533 , 0.350105 , 0.93247949, 0.929417 , 0.80019258, 0.39610545, 0.8582685 , 0.45710434, 0.1261722 , 0.85195837, 0.8162467 , 0.13556544, 0.86652652, 0.51896305, 0.74359076, 0.26817602, 0.21546148, 0.84831281, 0.6002138 , 0.14770547, 0.36587009, 0.85903582, 0.46828358, 0.3368529 , 0.34095386, 0.8246442 , 0.45429903, 0.9483535 , 0.31220015, 0.75648033, 0.28570549, 0.7678388 , 0.01759798, 0.12982098, 0.25925691, 0.87009196, 0.32249838, 0.48352554, ... 0.25857775, 0.09677339, 0.18039325, 0.25448073, 0.83934999, 0.22122572, 0.82839782, 0.74329961, 0.97429452, 0.75359014, 0.11511305, 0.94004275, 0.84209462, 0.4436286 , 0.43358894, 0.03095428, 0.21764154, 0.71488612, 0.1105268 , 0.99174489, 0.02168247, 0.99097346, 0.29620757, 0.46086521, 0.54547145, 0.28955319, 0.22049757, 0.0505692 , 0.823433 , 0.97230937, 0.12555748, 0.85983084, 0.72053945, 0.75022534, 0.38780546, 0.05757491, 0.69042312, 0.96212643, 0.65875825, 0.23405231, 0.53673526, 0.12167514, 0.71787855, 0.67286419, 0.446794 , 0.00866449, 0.06036607, 0.68197769, 0.69241334, 0.57520933, 0.22769122, 0.66313159, 0.1055052 , 0.62768057, 0.57231791, 0.35549364, 0.21916604, 0.7245827 , 0.18099196, 0.15735466, 0.63276412, 0.66062453, 0.1019691 , 0.26228831, 0.09859603, 0.91383146, 0.00838652, 0.35101158, 0.15682101, 0.46699488, 0.90683736, 0.70737638, 0.36017375, 0.1866537 , 0.70501213, 0.54191587, 0.72028997, 0.04485529, 0.17314062, 0.3198733 , 0.4665367 , 0.56944602, 0.56209399, 0.54291566, 0.56611634, 0.41747562, 0.27881285, 0.51812954, 0.12186051, 0.74927137, 0.95293572, 0.05409649, 0.78232714, 0.22383289, 0.33641135, 0.03353818, 0.9690858 , 0.56209561, 0.08158143, 0.32155563])
array([10., -4., 1., 7., 10., 10., -9., -4., 0., 10.])
array([ 7., 1., -3., 6., 1., -7., 4., -3., 8., 6.])
array([ 0., -1., -9., 2., 1., 7., -1., -9., 3., 8.])
array([ -5., 2., -3., 5., -10., -4., -2., -4., -2., -8.])
PandasIndex(Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='int64', name='neuron'))
PandasIndex(Index([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ... 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000], dtype='int64', name='stimulus', length=1000))
As before, we can visualize each principal component and plot the eigenspectrum:
We know by design that this data was generated from 2 latent variables – color and size. However, in real datasets with naturalistic stimuli, we often don’t know what the latent variables are! It’s common to use the eigenspectrum to estimate the latent dimensionality of the data. For instance, inspecting the eigenspectrum of our toy example tells us that the first two dimensions have much higher variance than the rest. We refer to these as the effective dimensions.
In general, there are several approaches for estimating dimensionality based on the eigenspectrum:
The rank of the covariance matrix – equal to the number of nonzero eigenvalues – would be the latent dimensionality in the ideal setting where the data has zero noise. In real data, the rank is typically equal to the ambient dimensionality (which here is the number of neurons we record from), since there is typically some variance along every dimension.
print(f"rank = {(big_pca.eigenvalue > 0).sum().values}")
def view_thresholded_eigenspectrum(pca: PCA, *, threshold: int | float) -> Figure:
fig = view_eigenspectrum(pca)
ax = fig.get_axes()[0]
xlim = ax.get_xlim()
ylim = ax.get_ylim()
ax_twin = ax.twinx()
ax_twin.set_ylabel("proportion of variance")
ax_twin.axhline(threshold, ls="--", c="gray")
ax_twin.yaxis.set_major_formatter(ticker.PercentFormatter(1))
total_variance = pca["eigenvalue"].sum().values
ylim /= total_variance
ax_twin.set_ylim(ylim)
ax_twin.fill_between(x=xlim, y1=ylim[-1], y2=threshold, color="green", alpha=0.1)
ax_twin.fill_between(x=xlim, y1=ylim[0], y2=threshold, color="red", alpha=0.1)
sns.despine(ax=ax_twin, offset=20, left=True, bottom=True, right=False, top=True)
return fig
view_thresholded_eigenspectrum(big_pca, threshold=0)
rank = 10
A very commonly used method is to set a threshold based on the cumulative variance of the data: the number of dimensions required to exceed, say 90\% of the variance, is taken as the latent dimensionality.
def view_cumulative_eigenspectrum(
pca: xr.DataArray, *, threshold: float = 0.9
) -> Figure:
fig, ax = plt.subplots(figsize=(pca.sizes["component"], 5))
data = pca["eigenvalue"].copy()
total_variance = data.sum().values
data["eigenvalue"] = data.cumsum()
data = data.rename({"eigenvalue": "cumulative variance"})
data["cumulative proportion of variance"] = (
data["cumulative variance"] / pca["eigenvalue"].sum()
)
sns.lineplot(
ax=ax,
data=data.to_dataframe(),
x="rank",
y="cumulative variance",
marker="s",
)
ax.set_xticks(pca["rank"].values)
xlim = ax.get_xlim()
ylim = ax.get_ylim()
ylim /= total_variance
ax_twin = ax.twinx()
ax_twin.set_ylabel("cumulative proportion of variance")
ax_twin.set_ylim(ylim)
ax_twin.axhline(threshold, ls="--", c="gray")
ax_twin.fill_between(x=xlim, y1=ylim[-1], y2=threshold, color="red", alpha=0.1)
ax_twin.fill_between(x=xlim, y1=ylim[0], y2=threshold, color="green", alpha=0.1)
sns.despine(ax=ax, offset=20)
sns.despine(ax=ax_twin, offset=20, left=True, bottom=True, right=False, top=True)
ax_twin.yaxis.set_major_formatter(ticker.PercentFormatter(1))
fig.tight_layout()
plt.close(fig)
return fig
view_cumulative_eigenspectrum(big_pca)
When the number of latent dimensions is low, eigenspectra often have a sharp discontinuity (the “knee”, or “elbow”), where a small number of dimensions have high-variance and the remainder have much have lower variance. The latent dimensionality is then taken to be the number of dimensions above this threshold determined by eye.
def view_eigenspectrum_knee(pca: PCA, *, knee: int) -> Figure:
fig = view_eigenspectrum(pca)
ax = fig.get_axes()[0]
ax.plot(
knee,
big_pca["eigenvalue"].isel({"component": big_pca["rank"] == knee}).values,
"o",
ms=30,
mec="r",
mfc="none",
mew=3,
)
ax.axvline(knee, ls="--", c="gray")
xlim = ax.get_xlim()
ylim = ax.get_ylim()
ax.fill_betweenx(y=ylim, x1=xlim[-1], x2=knee, color="red", alpha=0.1)
ax.fill_betweenx(y=ylim, x1=xlim[0], x2=knee, color="green", alpha=0.1)
return fig
view_eigenspectrum_knee(big_pca, knee=3)
So far, all of the techniques discussed depend on a paramter that needs to be decided by the researcher. For instance, a cumulative threshold of 90% is an arbitrary choice, and if we decided to choose 95% instead, we would get a different number of dimensions. This is why sometimes a parameter-free estimate is preferred.
A metric such as effective dimensionality summarizes the spectrum using an entropy-like measure, taking into account variances along all the dimensions:
d_\text{eff}(\lambda_1, \dots \lambda_n) = \dfrac{\left( \sum_{i=1}^n \lambda_i \right)^2}{\sum_{i=1}^n \lambda_i^2}
def compute_effective_dimensionality(eigenspectrum: np.ndarray) -> float:
return (np.sum(eigenspectrum) ** 2) / (eigenspectrum**2).sum()
def view_effective_dimensionality_examples(pca: PCA) -> Figure:
n_components = pca.sizes["component"]
spectrum_1 = [float(pca["eigenvalue"].mean().values)] * n_components
weights = 1 / (1 + np.arange(n_components, dtype=np.float32))
weights /= weights.sum()
spectrum_2 = list(weights * pca["eigenvalue"].sum().values)
spectrum_3 = [float(pca["eigenvalue"].sum().values)] + [0] * (n_components - 1)
data = pd.DataFrame(
{
"eigenvalue": spectrum_1 + spectrum_2 + spectrum_3,
"rank": list(np.tile(pca["rank"].values, 3)),
"example": [0] * n_components + [1] * n_components + [2] * n_components,
}
)
# return data
g = sns.relplot(
kind="line",
data=data,
col="example",
x="rank",
y="eigenvalue",
marker="o",
height=3,
aspect=1,
facet_kws={
"sharey": False,
},
)
for i_spectrum, ax in enumerate(g.axes.flat):
d = compute_effective_dimensionality(
data.loc[data["example"] == i_spectrum, "eigenvalue"]
)
ax.set_title("$d_{eff}$" + f" = {d.round(2)}")
ax.set_xticks([1, 10])
sns.despine(g.figure, offset=10)
g.figure.tight_layout(w_pad=2)
plt.close(g.figure)
return g.figure
print(
"effective dimensionality ="
f" {compute_effective_dimensionality(big_pca['eigenvalue']).values.round(2)}"
)
view_effective_dimensionality_examples(big_pca)
effective dimensionality = 2.55
Before PCA, it’s often recommended to preprocess the data by Z-scoring each of the input features X – ensuring that they have zero mean and unit variance:
Z = \dfrac{X - \mu}{\sigma}
Often, PCA is applied to data where the features are fundamentally different from each other. For example, we might have a dataset where the features of interest are the prices of cars (in dollars) and their masses (in kilograms). Since these two features have different units, the variances of the features are not directly comparable – there’s no obvious way to numerically compare a variance of ($20,000)2 in price and a variance of (1,000 kg)2 in mass. Even if the features being compared are all the same, if they are in different units – say euros, dollars, and cents – the raw variances of the data matrix are meaningless.
Since PCA implicitly assumes that the variances along each dimension are comparable, we can Z-score each of the features before applying PCA to ensure that they are on a common scale.
Note, however, that this transformation reduces the information in the system – it is possible that the variances of the features are informative.