PCA

Contents

PCA#

Principal Component Analysis (PCA) is commonly used for dimensionality reduction and visualization.

Change of Base:

Transforms the data by changing its basis to highlight key patterns or components.

import matplotlib.pyplot as plt
import numpy as np
np.random.seed(42)
plt.figure(figsize=(5,5))
x = np.linspace(-1,1,20)
noise = np.random.randn(20)/3
y = 2*x+noise
y = y-np.mean(y)
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(np.stack([x,y]).T)
m1 = pca.components_[0,1]/pca.components_[0,0]
m2 = pca.components_[1,1]/pca.components_[1,0]
y1,y2 = m1*x, m2*x
plt.scatter(x,y,c='g');
plt.plot(x,y1, 'r--', label='PC1')
plt.plot(x,y2, 'b--', label='PC2')
plt.xlim(-2,2)
plt.ylim(-2,2)
plt.grid()
plt.legend();

_images/4fa1e1ec2cb7407e0d46c2348d02659407c26f483f5dbee67c5dbc15a45400c5.png

Cancer Dataset#

from sklearn.datasets import load_breast_cancer
X, y = load_breast_cancer(return_X_y=True)

f_names = load_breast_cancer().feature_names

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# apply min_max_scaler to training and test set
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

PCA and dim=10#

# 10 pc
from sklearn.decomposition import PCA
pca_10 = PCA(n_components=10)
pca_10.fit(X_train_scaled)

PCA(n_components=10)

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

# explained_variance_ratio_
pca_10.explained_variance_ratio_

array([0.51834856, 0.18411207, 0.07009027, 0.0628725 , 0.03973014,
       0.03298706, 0.01540735, 0.01327304, 0.0105223 , 0.00970247])

# cumsum
pca_10.explained_variance_ratio_.cumsum()

array([0.51834856, 0.70246063, 0.7725509 , 0.8354234 , 0.87515355,
       0.90814061, 0.92354796, 0.936821  , 0.9473433 , 0.95704577])

# plot cumsum
import matplotlib.pyplot as plt
x_l = ['PC'+str(i) for i in range(1,11)   ]
plt.plot(x_l, pca_10.explained_variance_ratio_.cumsum())
plt.xlabel('Principal Components')
plt.ylabel('Cumulative Variance Ratio')
plt.grid()

_images/549213a6dc360cd02cb83803f9411d9caaedd96a6a06df7d58b92ceb7263026b.png

# PC1
plt.figure(figsize=(15,4))
plt.bar(f_names, pca_10.components_[0,:])
plt.title('PC1')
plt.xticks(rotation=90);

_images/c064f7e0064b8401c0dff470da5074baf961c2de3e21d3e55918a45771398a02.png

# PC2
plt.figure(figsize=(15,4))
plt.bar(f_names, pca_10.components_[1,:])
plt.title('PC2')
plt.xticks(rotation=90);

_images/3e68b8d35ddced839e5a9b69a8c65151c85f282665f4b970f4b9266a5f971d8d.png

PCA and dim=2#

# 2 pc
pca_2 = PCA(n_components=2)
pca_2.fit(X_train_scaled)

PCA(n_components=2)

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

# transform
X_pca_2 = pca_2.transform(X_train_scaled)
X_pca_2.shape

(426, 2)

# plot transformed data
import seaborn as sns
sns.scatterplot( x = X_pca_2[:,0], y  = X_pca_2[:,1]   )
plt.xlabel('PC1', fontsize=20)
plt.ylabel('PC2', fontsize=20);

_images/1163da8e5cc35185ba682276dd423ce9f05023068339d14684748a2d113504a0.png

# plot transformed data with hue
plt.figure(figsize=(5,5))
sns.scatterplot( x = X_pca_2[:,0], y  = X_pca_2[:,1], hue = y_train, palette ='Set2'   )
plt.xlabel('PC1', fontsize=20)
plt.ylabel('PC2', fontsize=20);

_images/cb1b13c0971d525fdc6c639a5559fc07cd2033d72d07b1dbaffdea8fd9283604.png

Hybrid Model kNN and PCA#

# knn with 3 neighbors applied to the original data
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train_scaled, y_train)
knn.score(X_train_scaled, y_train), knn.score(X_test_scaled, y_test)

(0.9906103286384976, 0.972027972027972)

# PCA with dim=2
knn.fit(X_pca_2, y_train)
knn.score(X_pca_2, y_train), knn.score(pca_2.transform(X_test_scaled), y_test)

(0.9624413145539906, 0.958041958041958)