### PCA and Data Whitening

In [1]:
import numpy as np
#from numpy import *
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.patches import FancyArrowPatch
from mpl_toolkits.mplot3d import proj3d
%matplotlib inline


### Eigenvectors and Eigenvalues

The eigenvalue equation:

$$W \vec x = \lambda \vec x$$
• a vector $\vec x$ with this property is called an eigenvector of $W$
• $\lambda$ is the corresponding eigenvalue (see the numerical check below)
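A minimal sketch verifying the eigenvalue equation numerically; the matrix `W` below is a made-up example, not the covariance matrix used later:

In [ ]:
# verify W x = lambda x for each eigenpair of a small symmetric matrix
W = np.array([[2.0, 1.0],
              [1.0, 2.0]])
lam, X = np.linalg.eig(W)  # eigenvalues and eigenvectors (as columns of X)
for lam_i, x in zip(lam, X.T):
    np.testing.assert_array_almost_equal(W.dot(x), lam_i * x)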

#### (Random) data

In [2]:
cov = np.array([[2.0, 0.8], [0.8, 0.6]])
mean = np.array([1., 1.])

rd = np.random.multivariate_normal(mean, cov, 50)

def plot_data():
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111)
    ax.scatter(rd[...,0], rd[...,1], c='b', marker='x', label="data")
    ax.set_xlim(-2.,4.)
    ax.set_ylim(-2.,4.)
    ax.set_xlabel('x_values')
    ax.set_ylabel('y_values')

plot_data()

In [3]:
mean_empirical = rd.mean(axis=0)
# note that the empirical mean differs from the "true" mean of the distribution
mean_empirical

Out[3]:
array([ 0.84778864,  0.92297188])
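The deviation shrinks as the sample size grows. A quick sketch; the name `rd_large` is used only for this illustration:

In [ ]:
# with more samples the empirical mean gets much closer to the true mean [1., 1.]
rd_large = np.random.multivariate_normal(mean, cov, 100000)
print(rd_large.mean(axis=0))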

### Covariance matrix

In [4]:
n = rd.shape[0]

# empirical covariance matrix, computed from its definition
cov_mat = 1./(n-1)*(rd - mean_empirical).T.dot((rd - mean_empirical))

# the same computed by numpy
cov_mat_np = np.cov(rd, rowvar=0)

In [5]:
np.testing.assert_array_almost_equal(cov_mat, cov_mat_np)


In [6]:
x = np.array([1.,-1.])
x = x/np.linalg.norm(x)


Repeated multiplication of a (renormalized) vector by the covariance matrix:

In [7]:
plt.xlim(-1.1,1.1)
plt.ylim(-1.1,1.1)
for i in range(10):
    plt.arrow(0,0, x[0], x[1], alpha=0.5)
    x = cov_mat.dot(x)
    x = x/np.linalg.norm(x)


The vector converges to a fixed point: the eigenvector corresponding to the largest eigenvalue.
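This is the idea behind the power iteration method. A minimal sketch; the function name `power_iteration` and the iteration count are assumptions for illustration:

In [ ]:
def power_iteration(A, n_iter=100):
    # repeatedly multiply and renormalize a random start vector;
    # it converges to the eigenvector of the largest eigenvalue
    x = np.random.rand(A.shape[0])
    for _ in range(n_iter):
        x = A.dot(x)
        x = x / np.linalg.norm(x)
    return x

# approximately the first eigenvector (up to sign), cf. the next cell
power_iteration(cov_mat)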

In [8]:
eig_val, eig_vector_matrix = np.linalg.eig(cov_mat)
eig_vector_1 = eig_vector_matrix[:,0]
eig_val_1 = eig_val[0]
eig_vector_2 = eig_vector_matrix[:,1]
eig_val_2 = eig_val[1]
eig_vector_1

Out[8]:
array([ 0.91575757,  0.40173133])

From the eigenvalue equation: $$\frac{W \vec x}{\lambda} = \vec x$$

In [9]:
x = eig_vector_1
print(x)
# this doesn't change x:
x = cov_mat.dot(x)
x = x/eig_val[0]
print(x)

[ 0.91575757  0.40173133]
[ 0.91575757  0.40173133]
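Conversely, given an eigenvector, the corresponding eigenvalue can be recovered via the Rayleigh quotient $\lambda = \frac{\vec x^T W \vec x}{\vec x^T \vec x}$. A short check:

In [ ]:
# recover the eigenvalue from the eigenvector; approximately eig_val_1
x = eig_vector_1
print(x.dot(cov_mat.dot(x)) / x.dot(x))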


For visualization we take the square root of the eigenvalues: in the new coordinate system the covariance matrix is diagonal, so the eigenvalues correspond to the variances along the axes. The standard deviation is the more natural length scale for plotting, hence the square root.

In [10]:
plot_data()
s1 = np.sqrt(eig_val_1)
s2 = np.sqrt(eig_val_2)
plt.arrow(mean_empirical[0],mean_empirical[1], s1 * eig_vector_1[0], s1 * eig_vector_1[1], alpha=1., color='r')
plt.arrow(mean_empirical[0],mean_empirical[1], s2 * eig_vector_2[0], s2 * eig_vector_2[1], alpha=1., color='g')

Out[10]:
<matplotlib.patches.FancyArrow at 0x10d8b2ad0>

### PCA (Principal Component Analysis)

In [11]:
class Arrow3D(FancyArrowPatch):
    def __init__(self, xs, ys, zs, *args, **kwargs):
        FancyArrowPatch.__init__(self, (0,0), (0,0), *args, **kwargs)
        self._verts3d = xs, ys, zs

    def draw(self, renderer):
        xs3d, ys3d, zs3d = self._verts3d
        xs, ys, zs = proj3d.proj_transform(xs3d, ys3d, zs3d, renderer.M)
        self.set_positions((xs[0],ys[0]),(xs[1],ys[1]))
        FancyArrowPatch.draw(self, renderer)

# class 0:
# covariance matrix and mean
cov0 = np.array([[ 5., -2., 1.],
                 [-2.,  3., 1.],
                 [ 1.,  1., 9.]])

mean0 = np.array([2.,-1.,3])

#cov0 = np.array([[5.,0.,0], [0., 1., 0], [0, 0, 3.]])
#mean0 = np.array([0.,0.,0.])
#number of data points
m0 = 1000

# generate m0 gaussian distributed data points with
# mean0 and cov0.
rd = np.random.multivariate_normal(mean0, cov0, m0)

mean = rd.mean(axis=0)
cov_mat = np.cov(rd, rowvar=0)

In [12]:
# note: if performance is an issue use the scipy implementation instead
eigenvalues, eigenvectors = np.linalg.eig(cov_mat)

# check if the eigen equation is valid: A * x == lambda * x
for eva, eve in zip(eigenvalues, eigenvectors.T):
    np.testing.assert_array_almost_equal(cov_mat.dot(eve),
                                         eva * eve,
                                         decimal=6, err_msg='', verbose=True)
# the same, but compact in matrix form
np.testing.assert_array_almost_equal(cov_mat.dot(eigenvectors), eigenvalues * eigenvectors)

In [13]:

# eigenvectors scaled by the standard deviations (sqrt of the eigenvalues) for visualization;
# broadcasting scales each column (eigenvector) by the corresponding factor
sev = eigenvectors * np.sqrt(eigenvalues)

#from sklearn import decomposition
#pca = decomposition.PCA(n_components=3)
#pca.fit(rd)
#sev = pca.components_.T * 3.

In [14]:
#%matplotlib notebook
fig = plt.figure(figsize=(12,12))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(rd[...,0], rd[...,1], rd[...,2], c='b', marker='x', label="raw data")

for s in sev.T:
    a = Arrow3D([mean[0], s[0]+mean[0]], [mean[1], s[1]+mean[1]], [mean[2], s[2]+mean[2]],
                mutation_scale=20, lw=3, arrowstyle="-|>", color="r")
    ax.add_artist(a)

ax.set_xlabel('x_values')
ax.set_ylabel('y_values')
ax.set_zlabel('z_values')
ax.set_xlim(-9.,9.)
ax.set_ylim(-9.,9.)
ax.set_zlim(-9.,9.)
plt.title('Eigenvectors')
plt.draw()
plt.show()

In [15]:
#plt.close('all')
# the eigenvectors computed by numpy are normalized to length 1
# (they are the columns of the eigenvector matrix)
for ev in eigenvectors.T:
    np.testing.assert_array_almost_equal(1.0, np.linalg.norm(ev))
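For a symmetric matrix such as the covariance matrix, the eigenvectors are not only normalized but mutually orthogonal, so the eigenvector matrix is orthogonal. A quick check:

In [ ]:
# the eigenvector matrix of a symmetric matrix is orthogonal: V^T V = I
np.testing.assert_array_almost_equal(eigenvectors.T.dot(eigenvectors), np.eye(3))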


In [16]:


# sort the eigenvectors and eigenvalues
idx = eigenvalues.argsort()[::-1]
eigenvalues = eigenvalues[idx]
eigenvectors = eigenvectors[:,idx]

# Now we change to the orthonormal basis given by the eigenvectors.
# Note that the scalar product of two vectors can be interpreted as the
# projection of one vector onto the other.
rd_new_basis = (rd-mean).dot(eigenvectors)
# rd_new_basis are the data points in the new basis
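In the new basis the empirical covariance matrix is diagonal, with the (sorted) eigenvalues on the diagonal. A quick check:

In [ ]:
# in the eigenbasis the covariance matrix is diagonal,
# with the eigenvalues as the diagonal entries
np.testing.assert_array_almost_equal(np.cov(rd_new_basis, rowvar=0),
                                     np.diag(eigenvalues))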

In [17]:
# Plot the data in the new space:
#plt.close('all') # close all latent plotting windows

fig = plt.figure(figsize=(12,12))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(rd_new_basis[...,0], rd_new_basis[...,1], rd_new_basis[...,2], c='b', marker='x', label="projected data")

for s in (5. * np.eye(3)):
    print(s)
    a = Arrow3D([0., s[0]], [0., s[1]], [0., s[2]], mutation_scale=20, lw=3, arrowstyle="-|>", color="r")
    ax.add_artist(a)

ax.set_xlim(-9.,9.)
ax.set_ylim(-9.,9.)
ax.set_zlim(-9.,9.)
ax.set_xlabel('new_x_values')
ax.set_ylabel('new_y_values')
ax.set_zlabel('new_z_values')

plt.title('Data in Eigenspace.')
plt.draw()
plt.show()

[ 5.  0.  0.]
[ 0.  5.  0.]
[ 0.  0.  5.]

In [18]:
%matplotlib inline
###
# Most of the variance lies in the first two principal components,
# so we can drop the last dimension (dimensionality reduction).

rd_new_basis = (rd-mean).dot(eigenvectors[...,:2])

# Whitening:
# scale each component by the inverse of the square root of its eigenvalue,
# so that every direction has unit variance
rd_new_basis = rd_new_basis / np.sqrt(eigenvalues[:2])

plt.close('all') # close all latent plotting windows
fig = plt.figure(figsize=(8,8))
ax = fig.add_subplot(111)
ax.scatter(rd_new_basis[...,0], rd_new_basis[...,1], c='b', marker='x', label="projected data")
ax.set_xlim(-4.,4.)
ax.set_ylim(-4.,4.)
ax.set_xlabel('new x-values')
ax.set_ylabel('new y-values')
plt.draw()
plt.title("Whitened data")
plt.show()
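After whitening, the empirical covariance matrix of the data should be (approximately) the identity. A quick check:

In [ ]:
# whitened data: unit variance in every direction, no correlations
np.testing.assert_array_almost_equal(np.cov(rd_new_basis, rowvar=0), np.eye(2))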


### Using matplotlib's PCA

In [19]:
# the same with matplotlib's PCA
plt.close('all') # close all latent plotting windows

# note: matplotlib.mlab.PCA has been removed from newer matplotlib releases;
# for current code use sklearn.decomposition.PCA instead (see below)
from matplotlib.mlab import PCA
# construct the PCA object from the numpy data array
results = PCA(rd)
# an array with the fraction of variance explained by each component
results.fracs
# the data projected into PCA space
r = results.Y

fig = plt.figure(figsize=(15,15))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(r[...,0], r[...,1], r[...,2], c='b', marker='x', label="projected data")
ax.set_xlim(-5.,5.)
ax.set_ylim(-5.,5.)
ax.set_zlim(-5.,5.)
ax.set_xlabel('x_values')
ax.set_ylabel('y_values')
ax.set_zlabel('z_values')
plt.title('Data in Eigenspace.')
plt.draw()
plt.show()


#### Assumptions of PCA

1. Linearity
2. Large variances have important structure, i.e. the data has a high signal-to-noise ratio (SNR)
3. Principal components are orthogonal

see [Shl]

For PCA with scikit-learn see sklearn.decomposition.PCA. A minimal sketch (assuming scikit-learn is installed; whiten=True performs the whitening step shown above):
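In [ ]:
from sklearn import decomposition

# PCA with whitening; keep only the first two principal components
pca = decomposition.PCA(n_components=2, whiten=True)
rd_sklearn = pca.fit_transform(rd)

# fraction of the variance explained by each kept component
print(pca.explained_variance_ratio_)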