import numpy as np
#from numpy import *
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.patches import FancyArrowPatch
from mpl_toolkits.mplot3d import proj3d
%matplotlib inline
Eigenvalue equation:
$$ W \vec x = \lambda \vec x $$
cov = np.array([[2.0, 0.8], [0.8, 0.6]])
mean = np.array([1., 1.])
rd = np.random.multivariate_normal(mean, cov, 50)
def plot_data():
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111)
    ax.scatter(rd[..., 0], rd[..., 1], c='b', marker='x', label="data")
    ax.set_xlim(-2., 4.)
    ax.set_ylim(-2., 4.)
    ax.set_xlabel('x_values')
    ax.set_ylabel('y_values')
plot_data()
mean_empirical = rd.mean(axis=0)
# note that the empirical mean differs from the "true" mean used to generate the data
mean_empirical
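The next cell computes the empirical covariance matrix directly from the definition of the unbiased estimator
$$ \hat C = \frac{1}{n-1} \sum_{i=1}^{n} (\vec x_i - \bar{\vec x})(\vec x_i - \bar{\vec x})^T = \frac{1}{n-1} X_c^T X_c $$
with the centered data matrix $X_c$ (rows are data points). The result is checked against np.cov.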
n = rd.shape[0]
# empirical covariance matrix, computed directly from the definition above
cov_mat = 1. / (n - 1) * (rd - mean_empirical).T.dot(rd - mean_empirical)
cov_mat_np = np.cov(rd, rowvar=0)
np.testing.assert_array_almost_equal(cov_mat, cov_mat_np)
# start with an arbitrary unit vector
x = np.array([1., -1.])
x = x / np.linalg.norm(x)
# multiply once with the covariance matrix and renormalize
x = cov_mat.dot(x)
x = x / np.linalg.norm(x)
# reset the start vector for the iteration below
x = np.array([1., -1.])
x = x / np.linalg.norm(x)
Repeated multiplication of a (renormalized) vector with the covariance matrix:
plt.xlim(-1.1, 1.1)
plt.ylim(-1.1, 1.1)
for i in range(10):
    plt.arrow(0, 0, x[0], x[1], alpha=0.5)
    x = cov_mat.dot(x)
    x = x / np.linalg.norm(x)
The vector rotates towards a "fixed point" vector. This fixed point is the eigenvector corresponding to the largest eigenvalue.
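This procedure is the power iteration. A minimal self-contained sketch (the helper name power_iteration is illustrative, not part of the notebook):
def power_iteration(A, n_iter=50):
    # start from a random vector, repeatedly apply A and renormalize
    x = np.random.rand(A.shape[0])
    for _ in range(n_iter):
        x = A.dot(x)
        x = x / np.linalg.norm(x)
    # the Rayleigh quotient x^T A x estimates the dominant eigenvalue
    return x, x.dot(A).dot(x)

# converges (up to sign) to the eigenvector of the largest eigenvalue
x_dom, lambda_dom = power_iteration(cov_mat)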
eig_val, eig_vector_matrix = np.linalg.eig(cov_mat)
eig_vector_1 = eig_vector_matrix[:,0]
eig_val_1 = eig_val[0]
eig_vector_2 = eig_vector_matrix[:,1]
eig_val_2 = eig_val[1]
eig_vector_1
From the eigenvalue equation: $$ \frac{W \vec x}{\lambda} = \vec x $$
x = eig_vector_1
print(x)
# this does not change x:
x = cov_mat.dot(x)
x = x / eig_val[0]
print(x)
For visualization take the square root of the eigenvalues: in the new coordinate system the covariance matrix is diagonal and the eigenvalues correspond to the variances along the eigenvector directions. For visualization the standard deviation is more appropriate, so we take the square root of the variances.
plot_data()
s1 = np.sqrt(eig_val_1)
s2 = np.sqrt(eig_val_2)
plt.arrow(mean_empirical[0],mean_empirical[1], s1 * eig_vector_1[0], s1 * eig_vector_1[1], alpha=1., color='r')
plt.arrow(mean_empirical[0],mean_empirical[1], s2 * eig_vector_2[0], s2 * eig_vector_2[1], alpha=1., color='g')
# helper class to draw arrows in a 3D axes (standard matplotlib recipe)
class Arrow3D(FancyArrowPatch):
    def __init__(self, xs, ys, zs, *args, **kwargs):
        FancyArrowPatch.__init__(self, (0, 0), (0, 0), *args, **kwargs)
        self._verts3d = xs, ys, zs

    def draw(self, renderer):
        xs3d, ys3d, zs3d = self._verts3d
        xs, ys, zs = proj3d.proj_transform(xs3d, ys3d, zs3d, renderer.M)
        self.set_positions((xs[0], ys[0]), (xs[1], ys[1]))
        FancyArrowPatch.draw(self, renderer)
# class 0:
# covariance matrix and mean
cov0 = np.array([[5., -2., 1.],
[-2., 3., 1.],
[1., 1., 9.]])
mean0 = np.array([2.,-1.,3])
#cov0 = np.array([[5.,0.,0], [0., 1., 0], [0, 0, 3.]])
#mean0 = np.array([0.,0.,0.])
#number of data points
m0 = 1000
# generate m0 gaussian distributed data points with
# mean0 and cov0.
rd = np.random.multivariate_normal(mean0, cov0, m0)
mean = rd.mean(axis=0)
cov_mat = np.cov(rd, rowvar=0)
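With m0 = 1000 samples the empirical estimates should be close to the true parameters; the deviations shrink roughly like $1/\sqrt{m_0}$. A quick look at the largest absolute deviations:
print(np.abs(mean - mean0).max())
print(np.abs(cov_mat - cov0).max())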
# note: if performance is an issue use the scipy implementation instead
eigenvalues, eigenvectors = np.linalg.eig(cov_mat)
# check if the eigen equation is valid: A * x == lambda * x
for eva, eve in zip(eigenvalues, eigenvectors.T):
    np.testing.assert_array_almost_equal(cov_mat.dot(eve),
                                         eva * eve,
                                         decimal=6, err_msg='', verbose=True)
# the same, but compact in matrix form
np.testing.assert_array_almost_equal(cov_mat.dot(eigenvectors), eigenvalues * eigenvectors)
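In matrix form the eigenvalue equation reads
$$ C\, V = V\, \Lambda $$
where the columns of $V$ are the eigenvectors and $\Lambda$ is the diagonal matrix of the eigenvalues; since the covariance matrix $C$ is symmetric, $V$ is orthogonal and $C = V \Lambda V^T$.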
# eigenvectors scaled by the standard deviations (the square roots of the
# eigenvalues) for visualization; each column of sev is one scaled eigenvector
sev = eigenvectors * np.sqrt(eigenvalues)
#from sklearn import decomposition
#pca = decomposition.PCA(n_components=3)
#pca.fit(rd)
#sev = pca.components_.T * 3.
#%matplotlib notebook
fig = plt.figure(figsize=(12,12))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(rd[...,0], rd[...,1], rd[...,2], c='b', marker='x', label="raw data")
for s in sev.T:
    a = Arrow3D([mean[0], s[0] + mean[0]], [mean[1], s[1] + mean[1]], [mean[2], s[2] + mean[2]],
                mutation_scale=20, lw=3, arrowstyle="-|>", color="r")
    ax.add_artist(a)
ax.set_xlabel('x_values')
ax.set_ylabel('y_values')
ax.set_zlabel('z_values')
ax.set_xlim(-9.,9.)
ax.set_ylim(-9.,9.)
ax.set_zlim(-9.,9.)
plt.title('Eigenvectors')
plt.draw()
plt.show()
#plt.close('all')
# the eigenvectors computed by numpy are normalized to length 1
# (note: the eigenvectors are the columns of the matrix, hence the transpose)
for ev in eigenvectors.T:
    np.testing.assert_array_almost_equal(1.0, np.linalg.norm(ev))
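Since the covariance matrix is symmetric (and the eigenvalues here are distinct), the eigenvectors are also mutually orthogonal, i.e. they form an orthonormal basis. A quick check:
np.testing.assert_array_almost_equal(eigenvectors.T.dot(eigenvectors), np.eye(3))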
# sort the eigenvalues and eigenvectors in descending order of eigenvalue
idx = eigenvalues.argsort()[::-1]
eigenvalues = eigenvalues[idx]
eigenvectors = eigenvectors[:,idx]
# Now we change to the orthonormal basis formed by the eigenvectors.
# Note that the scalar product of two vectors can be interpreted as the projection
# of one vector onto the other.
rd_new_basis = (rd-mean).dot(eigenvectors)
# rd_new_basis are the data points in the new basis
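In matrix notation this change of basis is
$$ Y = (X - \bar{\vec x})\, V $$
i.e. each entry of $Y$ is the scalar product of a centered data point with one of the eigenvectors.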
# Plot the data in the new space:
#plt.close('all') # close all latent plotting windows
fig = plt.figure(figsize=(12,12))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(rd_new_basis[...,0], rd_new_basis[...,1], rd_new_basis[...,2], c='b', marker='x', label="projected data")
# draw the new (axis-aligned) basis directions, scaled for visibility
for s in (5. * np.eye(3)):
    a = Arrow3D([0., s[0]], [0., s[1]], [0., s[2]], mutation_scale=20, lw=3, arrowstyle="-|>", color="r")
    ax.add_artist(a)
ax.set_xlim(-9.,9.)
ax.set_ylim(-9.,9.)
ax.set_zlim(-9.,9.)
ax.set_xlabel('new_x_values')
ax.set_ylabel('new_y_values')
ax.set_zlabel('new_z_values')
plt.title('Data in Eigenspace.')
plt.draw()
plt.show()
%matplotlib inline
# Dimensionality reduction: the last component has the smallest variance,
# so we can drop the last dimension
rd_new_basis = (rd - mean).dot(eigenvectors[..., :2])
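How much variance the first two components retain can be read off the sorted eigenvalues directly:
# fraction of the total variance captured by the first two principal components
print(eigenvalues[:2].sum() / eigenvalues.sum())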
# Whitening
# scale by the inverse of the sqrt of the eigenvalues
rd_new_basis = rd_new_basis / np.sqrt(eigenvalues[:2])
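Whitening rescales each principal component by its standard deviation:
$$ Y_w = (X - \bar{\vec x})\, V\, \Lambda^{-1/2} $$
so the covariance matrix of the whitened data is the identity. A quick check (this holds up to numerical precision, because the empirical covariance was used above):
np.testing.assert_array_almost_equal(np.cov(rd_new_basis, rowvar=0), np.eye(2))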
plt.close('all') # close all latent plotting windows
fig = plt.figure(figsize=(8,8))
ax = fig.add_subplot(111)
ax.scatter(rd_new_basis[...,0], rd_new_basis[...,1], c='b', marker='x', label="projected data")
ax.set_xlim(-4.,4.)
ax.set_ylim(-4.,4.)
ax.set_xlabel('new x-values')
ax.set_ylabel('new y-values')
plt.draw()
plt.title("Whitened data")
plt.show()
# the same with matplotlib's PCA implementation
# (note: matplotlib.mlab.PCA is deprecated and has been removed in newer
# matplotlib versions; scikit-learn's PCA is the usual replacement)
plt.close('all')  # close all latent plotting windows
from matplotlib.mlab import PCA
# construct the PCA object from the numpy data array
results = PCA(rd)
# fracs is an array of the variance fractions for each component
results.fracs
# Y is an array of the data projected into PCA space
r = results.Y
fig = plt.figure(figsize=(15,15))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(r[...,0], r[...,1], r[...,2], c='b', marker='x', label="projected data")
ax.set_xlim(-5.,5.)
ax.set_ylim(-5.,5.)
ax.set_zlim(-5.,5.)
ax.set_xlabel('x_values')
ax.set_ylabel('y_values')
ax.set_zlabel('z_values')
plt.title('Data in Eigenspace.')
plt.draw()
plt.show()
See [Shl].
For PCA with scikit-learn see sklearn.decomposition.PCA.
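A minimal sketch with scikit-learn (assuming scikit-learn is installed; whiten=True reproduces the whitening step from above):
from sklearn import decomposition

pca = decomposition.PCA(n_components=2, whiten=True)
rd_sklearn = pca.fit_transform(rd)      # centered, projected, and whitened data
print(pca.explained_variance_ratio_)    # fraction of variance per component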