# Linear Algebra for Machine Learning

Linear algebra is the backbone of machine learning. Most ML algorithms can be expressed as operations on vectors and matrices, making linear algebra essential for understanding and implementing these algorithms.

## Vectors in Machine Learning

### Vector Representation of Data

In ML, data points are typically represented as vectors:
```python
import numpy as np
import matplotlib.pyplot as plt

# Example: House price prediction
# Features: [square_feet, bedrooms, bathrooms, age]
house1 = np.array([2000, 3, 2, 5])   # House 1
house2 = np.array([1500, 2, 1, 10])  # House 2
house3 = np.array([2500, 4, 3, 2])   # House 3

# Dataset as matrix (each row is a data point)
X = np.array([house1, house2, house3])
print("Dataset shape:", X.shape)
print("Dataset:\n", X)
```

### Vector Operations in ML
#### Dot Product and Similarity
```python
def cosine_similarity(v1, v2):
    """Compute cosine similarity between two vectors"""
    dot_product = np.dot(v1, v2)
    norms = np.linalg.norm(v1) * np.linalg.norm(v2)
    return dot_product / norms

# Example: Document similarity
doc1 = np.array([1, 2, 0, 1])  # Word frequencies
doc2 = np.array([2, 1, 1, 0])  # Word frequencies
doc3 = np.array([0, 0, 1, 2])  # Word frequencies

similarity_12 = cosine_similarity(doc1, doc2)
similarity_13 = cosine_similarity(doc1, doc3)
print(f"Similarity between doc1 and doc2: {similarity_12:.3f}")
print(f"Similarity between doc1 and doc3: {similarity_13:.3f}")
```
#### Distance Metrics

```python
def euclidean_distance(v1, v2):
    """Euclidean distance between vectors"""
    return np.linalg.norm(v1 - v2)

def manhattan_distance(v1, v2):
    """Manhattan distance between vectors"""
    return np.sum(np.abs(v1 - v2))

# Example: K-Nearest Neighbors
def knn_classify(X_train, y_train, x_test, k=3):
    """Simple KNN classifier"""
    distances = [euclidean_distance(x_test, x_train) for x_train in X_train]
    k_indices = np.argsort(distances)[:k]
    k_labels = y_train[k_indices]
    return np.bincount(k_labels).argmax()

# Sample data
X_train = np.array([[1, 2], [2, 3], [3, 1], [6, 5], [7, 7], [8, 6]])
y_train = np.array([0, 0, 0, 1, 1, 1])  # Class labels
x_test = np.array([4, 4])

predicted_class = knn_classify(X_train, y_train, x_test, k=3)
print(f"Predicted class for {x_test}: {predicted_class}")

# Visualize
plt.figure(figsize=(8, 6))
colors = ['red', 'blue']
for i in range(2):
    mask = y_train == i
    plt.scatter(X_train[mask, 0], X_train[mask, 1],
                c=colors[i], label=f'Class {i}', s=100)
plt.scatter(x_test[0], x_test[1], c='green', marker='x', s=200, label='Test point')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()
plt.title('K-Nearest Neighbors Classification')
plt.grid(True)
plt.show()
```
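A library implementation should agree with the from-scratch classifier on this toy dataset; the snippet below is an optional cross-check added here, assuming scikit-learn is installed:

```python
# Optional cross-check with scikit-learn's KNN classifier (assumes scikit-learn is installed)
from sklearn.neighbors import KNeighborsClassifier

sk_knn = KNeighborsClassifier(n_neighbors=3)
sk_knn.fit(X_train, y_train)
print("scikit-learn prediction:", sk_knn.predict(x_test.reshape(1, -1))[0])
```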
## Matrices in Machine Learning

### Data Representation

```python
# Dataset matrix: rows = samples, columns = features
n_samples, n_features = 1000, 4
X = np.random.randn(n_samples, n_features)
y = np.random.randint(0, 2, n_samples)

print(f"Data matrix shape: {X.shape}")
print(f"Labels shape: {y.shape}")

# Feature statistics
print(f"Feature means: {np.mean(X, axis=0)}")
print(f"Feature std: {np.std(X, axis=0)}")
```
### Matrix Operations in Linear Regression

```python
class LinearRegression:
    def __init__(self):
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """Fit linear regression using normal equation"""
        # Add bias term (intercept)
        X_with_bias = np.column_stack([np.ones(X.shape[0]), X])
        # Normal equation: θ = (X^T X)^(-1) X^T y
        XtX = X_with_bias.T @ X_with_bias
        Xty = X_with_bias.T @ y
        theta = np.linalg.solve(XtX, Xty)
        self.bias = theta[0]
        self.weights = theta[1:]

    def predict(self, X):
        """Make predictions"""
        return X @ self.weights + self.bias

    def score(self, X, y):
        """R-squared score"""
        y_pred = self.predict(X)
        ss_res = np.sum((y - y_pred) ** 2)
        ss_tot = np.sum((y - np.mean(y)) ** 2)
        return 1 - (ss_res / ss_tot)

# Generate sample data
np.random.seed(42)
X = np.random.randn(100, 2)
true_weights = np.array([3, -2])
true_bias = 1
y = X @ true_weights + true_bias + 0.1 * np.random.randn(100)

# Fit model
model = LinearRegression()
model.fit(X, y)

print(f"True weights: {true_weights}")
print(f"Estimated weights: {model.weights}")
print(f"True bias: {true_bias}")
print(f"Estimated bias: {model.bias:.3f}")
print(f"R-squared: {model.score(X, y):.3f}")
```
## Matrix Factorization

### Principal Component Analysis (PCA)

```python
class PCA:
    def __init__(self, n_components):
        self.n_components = n_components
        self.components = None
        self.mean = None
        self.explained_variance_ratio = None

    def fit(self, X):
        """Fit PCA using eigendecomposition"""
        # Center the data
        self.mean = np.mean(X, axis=0)
        X_centered = X - self.mean
        # Compute covariance matrix
        cov_matrix = np.cov(X_centered.T)
        # Eigendecomposition (eigh is appropriate for the symmetric covariance matrix)
        eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)
        # Sort by eigenvalues (descending)
        idx = np.argsort(eigenvalues)[::-1]
        eigenvalues = eigenvalues[idx]
        eigenvectors = eigenvectors[:, idx]
        # Select top components (stored as rows)
        self.components = eigenvectors[:, :self.n_components].T
        self.explained_variance_ratio = eigenvalues[:self.n_components] / np.sum(eigenvalues)

    def transform(self, X):
        """Project data onto the principal components"""
        X_centered = X - self.mean
        return X_centered @ self.components.T

    def fit_transform(self, X):
        """Fit and transform in one step"""
        self.fit(X)
        return self.transform(X)

# Generate 2D data with correlation
np.random.seed(42)
mean = [0, 0]
cov = [[1, 0.8], [0.8, 1]]
X = np.random.multivariate_normal(mean, cov, 300)

# Apply PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Visualize
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Original data
ax1.scatter(X[:, 0], X[:, 1], alpha=0.6)
ax1.set_title('Original Data')
ax1.set_xlabel('Feature 1')
ax1.set_ylabel('Feature 2')
ax1.grid(True)
ax1.axis('equal')

# PCA transformed data
ax2.scatter(X_pca[:, 0], X_pca[:, 1], alpha=0.6)
ax2.set_title('PCA Transformed Data')
ax2.set_xlabel('First Principal Component')
ax2.set_ylabel('Second Principal Component')
ax2.grid(True)
ax2.axis('equal')

plt.tight_layout()
plt.show()

print(f"Explained variance ratio: {pca.explained_variance_ratio}")
print(f"Total variance explained: {np.sum(pca.explained_variance_ratio):.3f}")
```
## Eigenvalues and Eigenvectors

### Understanding Eigendecomposition

```python
def visualize_eigenvectors(A):
    """Visualize how matrix A transforms eigenvectors"""
    # Compute eigenvalues and eigenvectors
    eigenvalues, eigenvectors = np.linalg.eig(A)

    # Create unit circle
    theta = np.linspace(0, 2*np.pi, 100)
    unit_circle = np.array([np.cos(theta), np.sin(theta)])

    # Transform unit circle
    transformed_circle = A @ unit_circle

    plt.figure(figsize=(12, 5))

    # Original space
    plt.subplot(1, 2, 1)
    plt.plot(unit_circle[0], unit_circle[1], 'b-', label='Unit circle')
    # Plot eigenvectors
    for i, (val, vec) in enumerate(zip(eigenvalues, eigenvectors.T)):
        plt.arrow(0, 0, vec[0], vec[1], head_width=0.1, head_length=0.1,
                  fc=f'C{i}', ec=f'C{i}', label=f'Eigenvector {i+1} (λ={val:.2f})')
    plt.axis('equal')
    plt.grid(True)
    plt.legend()
    plt.title('Original Space')

    # Transformed space
    plt.subplot(1, 2, 2)
    plt.plot(transformed_circle[0], transformed_circle[1], 'r-', label='Transformed circle')
    # Plot transformed eigenvectors (scaled by their eigenvalues, direction unchanged)
    for i, (val, vec) in enumerate(zip(eigenvalues, eigenvectors.T)):
        transformed_vec = A @ vec
        plt.arrow(0, 0, transformed_vec[0], transformed_vec[1],
                  head_width=0.1, head_length=0.1,
                  fc=f'C{i}', ec=f'C{i}', label=f'Transformed eigenvector {i+1}')
    plt.axis('equal')
    plt.grid(True)
    plt.legend()
    plt.title('Transformed Space')

    plt.tight_layout()
    plt.show()

# Example matrix (symmetric, so its eigenvalues are real and its eigenvectors orthogonal)
A = np.array([[2, 1], [1, 2]])
visualize_eigenvectors(A)
```
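A quick numerical check of the defining property A v = λ v makes the picture concrete (a small addition to the original example):

```python
# Verify A v = lambda v for each eigenpair of the example matrix
eigenvalues, eigenvectors = np.linalg.eig(A)
for val, vec in zip(eigenvalues, eigenvectors.T):
    print(f"lambda = {val:.2f}, A v == lambda v: {np.allclose(A @ vec, val * vec)}")
```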
### Spectral Clustering

```python
from sklearn.cluster import KMeans
from sklearn.datasets import make_moons

def spectral_clustering(X, n_clusters=2, sigma=1.0):
    """Simple spectral clustering implementation"""
    n_samples = X.shape[0]

    # Compute pairwise squared distances and the similarity matrix (RBF kernel)
    distances = np.sum(X**2, axis=1, keepdims=True) + np.sum(X**2, axis=1) - 2 * X @ X.T
    similarity = np.exp(-distances / (2 * sigma**2))

    # Compute degree matrix
    degree = np.diag(np.sum(similarity, axis=1))

    # Compute symmetric normalized Laplacian: L = I - D^(-1/2) W D^(-1/2)
    D_inv_sqrt = np.diag(1.0 / np.sqrt(np.sum(similarity, axis=1)))
    L_norm = np.eye(n_samples) - D_inv_sqrt @ similarity @ D_inv_sqrt

    # Eigendecomposition
    eigenvalues, eigenvectors = np.linalg.eigh(L_norm)

    # Use smallest eigenvectors for clustering
    embedding = eigenvectors[:, :n_clusters]

    # K-means on embedding (n_init set explicitly to keep behavior stable across scikit-learn versions)
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    labels = kmeans.fit_predict(embedding)

    return labels, embedding

# Generate two moons dataset
X, y_true = make_moons(n_samples=200, noise=0.1, random_state=42)

# Apply spectral clustering
labels_spectral, embedding = spectral_clustering(X, n_clusters=2, sigma=0.3)

# Visualize results
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Original data
axes[0].scatter(X[:, 0], X[:, 1], c=y_true, cmap='viridis')
axes[0].set_title('True Clusters')
axes[0].grid(True)

# Spectral embedding
axes[1].scatter(embedding[:, 0], embedding[:, 1], c=y_true, cmap='viridis')
axes[1].set_title('Spectral Embedding')
axes[1].grid(True)

# Spectral clustering result
axes[2].scatter(X[:, 0], X[:, 1], c=labels_spectral, cmap='viridis')
axes[2].set_title('Spectral Clustering Result')
axes[2].grid(True)

plt.tight_layout()
plt.show()
```
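scikit-learn ships its own `SpectralClustering`. The comparison below is a hedged sketch, not part of the original example, and assumes an RBF affinity with `gamma = 1 / (2 * sigma**2)` to mirror the similarity matrix used above:

```python
# Optional comparison with scikit-learn's SpectralClustering (assumes scikit-learn is installed)
from sklearn.cluster import SpectralClustering

sk_spectral = SpectralClustering(n_clusters=2, affinity='rbf',
                                 gamma=1.0 / (2 * 0.3**2), random_state=42)
sk_labels = sk_spectral.fit_predict(X)
# Cluster labels are only defined up to permutation, so report agreement either way
agreement = max(np.mean(sk_labels == labels_spectral), np.mean(sk_labels != labels_spectral))
print(f"Agreement with the from-scratch implementation: {agreement:.1%}")
```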
## Matrix Calculus for Neural Networks

### Gradient Computation

```python
class SimpleNeuralNetwork:
    def __init__(self, input_size, hidden_size, output_size):
        # Initialize weights with small random values
        self.W1 = np.random.randn(input_size, hidden_size) * 0.1
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, output_size) * 0.1
        self.b2 = np.zeros((1, output_size))

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-np.clip(x, -250, 250)))

    def sigmoid_derivative(self, x):
        s = self.sigmoid(x)
        return s * (1 - s)

    def forward(self, X):
        """Forward pass"""
        self.z1 = X @ self.W1 + self.b1
        self.a1 = self.sigmoid(self.z1)
        self.z2 = self.a1 @ self.W2 + self.b2
        self.a2 = self.sigmoid(self.z2)
        return self.a2

    def backward(self, X, y, output):
        """Backward pass using matrix calculus"""
        m = X.shape[0]

        # Output layer gradients.
        # dz2 = output - y is the exact gradient of the binary cross-entropy loss with
        # respect to z2 (the sigmoid derivative cancels); the MSE printed during training
        # is used only as a simple monitoring metric.
        dz2 = output - y
        dW2 = (1/m) * self.a1.T @ dz2
        db2 = (1/m) * np.sum(dz2, axis=0, keepdims=True)

        # Hidden layer gradients (chain rule through W2 and the sigmoid)
        da1 = dz2 @ self.W2.T
        dz1 = da1 * self.sigmoid_derivative(self.z1)
        dW1 = (1/m) * X.T @ dz1
        db1 = (1/m) * np.sum(dz1, axis=0, keepdims=True)

        return dW1, db1, dW2, db2

    def train(self, X, y, epochs=1000, learning_rate=0.1):
        """Train the network with full-batch gradient descent"""
        losses = []
        for epoch in range(epochs):
            # Forward pass
            output = self.forward(X)

            # Compute loss (MSE, for monitoring)
            loss = np.mean((output - y)**2)
            losses.append(loss)

            # Backward pass
            dW1, db1, dW2, db2 = self.backward(X, y, output)

            # Update weights
            self.W1 -= learning_rate * dW1
            self.b1 -= learning_rate * db1
            self.W2 -= learning_rate * dW2
            self.b2 -= learning_rate * db2

            if epoch % 100 == 0:
                print(f"Epoch {epoch}, Loss: {loss:.4f}")
        return losses

# Generate XOR dataset
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([[0], [1], [1], [0]])

# Create and train network
nn = SimpleNeuralNetwork(2, 4, 1)
losses = nn.train(X, y, epochs=5000, learning_rate=1.0)

# Test the network
predictions = nn.forward(X)
print("\nFinal predictions:")
for i in range(len(X)):
    print(f"Input: {X[i]}, Target: {y[i][0]}, Prediction: {predictions[i][0]:.3f}")

# Plot training loss
plt.figure(figsize=(10, 6))
plt.plot(losses)
plt.title('Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Mean Squared Error')
plt.grid(True)
plt.show()
```
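A standard way to validate gradients derived with matrix calculus is a finite-difference check. The sketch below is an addition to the original example: it compares one analytic entry of `dW1` against a numerical estimate, using a helper `bce_loss` introduced here to match the binary cross-entropy interpretation of `dz2 = output - y` noted in the backward pass:

```python
# Finite-difference gradient check for a single weight (bce_loss is a helper added for this check)
def bce_loss(net, X, y):
    out = net.forward(X)
    eps = 1e-12  # guard against log(0)
    return -np.mean(y * np.log(out + eps) + (1 - y) * np.log(1 - out + eps))

eps = 1e-5
output = nn.forward(X)
dW1, db1, dW2, db2 = nn.backward(X, y, output)

i, j = 0, 0
orig_w = nn.W1[i, j]
nn.W1[i, j] = orig_w + eps
loss_plus = bce_loss(nn, X, y)
nn.W1[i, j] = orig_w - eps
loss_minus = bce_loss(nn, X, y)
nn.W1[i, j] = orig_w  # restore the original weight

numeric_grad = (loss_plus - loss_minus) / (2 * eps)
print(f"Analytic dW1[0, 0]: {dW1[i, j]:.6f}, numeric estimate: {numeric_grad:.6f}")
```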
## Singular Value Decomposition (SVD)

### Recommender Systems

```python
def matrix_factorization_svd(R, k=10):
    """Matrix factorization using truncated SVD for recommender systems"""
    # R is the user-item rating matrix (NaN marks missing ratings)
    # Fill missing values with the global mean rating
    R_filled = R.copy()
    mask = ~np.isnan(R)
    R_filled[~mask] = np.nanmean(R)

    # Center each user's ratings by that user's mean
    user_means = np.nanmean(R, axis=1, keepdims=True)
    R_centered = R_filled - user_means

    # SVD decomposition
    U, s, Vt = np.linalg.svd(R_centered, full_matrices=False)

    # Keep only top k components
    U_k = U[:, :k]
    s_k = s[:k]
    Vt_k = Vt[:k, :]

    # Reconstruct the (now dense) rating matrix
    R_reconstructed = U_k @ np.diag(s_k) @ Vt_k + user_means

    return R_reconstructed, U_k, s_k, Vt_k

# Create sample user-item rating matrix
np.random.seed(42)
n_users, n_items = 100, 50
R = np.random.rand(n_users, n_items) * 5

# Introduce missing values (roughly 80% of entries)
mask = np.random.rand(n_users, n_items) < 0.8
R[mask] = np.nan

print(f"Original matrix shape: {R.shape}")
print(f"Sparsity: {np.sum(mask) / (n_users * n_items):.2%}")

# Apply matrix factorization
R_pred, U, s, Vt = matrix_factorization_svd(R, k=10)

# Compute RMSE on observed entries
observed_mask = ~np.isnan(R)
rmse = np.sqrt(np.mean((R[observed_mask] - R_pred[observed_mask])**2))
print(f"RMSE on observed entries: {rmse:.3f}")

# Visualize singular values
plt.figure(figsize=(10, 6))
plt.plot(s, 'bo-')
plt.title('Singular Values')
plt.xlabel('Component')
plt.ylabel('Singular Value')
plt.grid(True)
plt.show()
```
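Two properties of the SVD underpin this approach and are easy to verify on a small dense matrix: keeping all components reconstructs the matrix exactly, and truncating to the top k singular values gives the best rank-k approximation in the Frobenius (least-squares) sense, with an error equal to the norm of the discarded singular values. The check below is an addition to the original example:

```python
# SVD sanity checks on a small dense matrix
M = np.random.randn(6, 4)
U_f, s_f, Vt_f = np.linalg.svd(M, full_matrices=False)
print("Exact reconstruction:", np.allclose(U_f @ np.diag(s_f) @ Vt_f, M))

M_rank2 = U_f[:, :2] @ np.diag(s_f[:2]) @ Vt_f[:2, :]
print(f"Rank-2 approximation error (Frobenius): {np.linalg.norm(M - M_rank2):.3f}")
print(f"Norm of discarded singular values:      {np.sqrt(np.sum(s_f[2:]**2)):.3f}")
```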
## Practical Applications

### Image Compression using SVD

```python
def compress_image_svd(image, k):
    """Compress an image by keeping only the top k singular values"""
    if len(image.shape) == 3:
        # Color image - compress each channel separately
        # (accumulate in float to avoid uint8 wrap-around before clipping)
        compressed = np.zeros(image.shape, dtype=float)
        for i in range(3):
            U, s, Vt = np.linalg.svd(image[:, :, i], full_matrices=False)
            compressed[:, :, i] = U[:, :k] @ np.diag(s[:k]) @ Vt[:k, :]
    else:
        # Grayscale image
        U, s, Vt = np.linalg.svd(image, full_matrices=False)
        compressed = U[:, :k] @ np.diag(s[:k]) @ Vt[:k, :]
    return np.clip(compressed, 0, 255).astype(np.uint8)

# Create a simple synthetic image
def create_test_image():
    x = np.linspace(0, 4*np.pi, 100)
    y = np.linspace(0, 4*np.pi, 100)
    X, Y = np.meshgrid(x, y)
    image = 128 + 127 * np.sin(X) * np.cos(Y)
    return image.astype(np.uint8)

# Test image compression at several ranks
original_image = create_test_image()
k_values = [5, 10, 20, 50]

fig, axes = plt.subplots(1, len(k_values) + 1, figsize=(15, 3))

# Original image
axes[0].imshow(original_image, cmap='gray')
axes[0].set_title('Original')
axes[0].axis('off')

# Compressed images
for i, k in enumerate(k_values):
    compressed = compress_image_svd(original_image, k)
    axes[i+1].imshow(compressed, cmap='gray')

    # Storage needed for the rank-k factors relative to the full image
    original_size = original_image.size
    compressed_size = k * (original_image.shape[0] + original_image.shape[1] + 1)
    ratio = compressed_size / original_size
    axes[i+1].set_title(f'k={k}\n({ratio:.1%} of original)')
    axes[i+1].axis('off')

plt.tight_layout()
plt.show()
```
## Summary

Linear algebra is fundamental to machine learning because:

- Data Representation: Data is naturally represented as vectors and matrices
- Efficient Computation: Matrix operations enable vectorized computations (see the short sketch after this list)
- Dimensionality Reduction: Techniques like PCA use eigendecomposition
- Neural Networks: Forward and backward passes are matrix multiplications
- Optimization: Gradient-based methods rely on matrix calculus
- Similarity and Distance: Vector operations measure data relationships
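The sketch below illustrates the efficiency point with a rough micro-benchmark (an addition to this document; exact timings depend on the machine and BLAS library):

```python
# Rough comparison of looped vs. vectorized matrix multiplication (timings are machine-dependent)
import time
import numpy as np

A = np.random.randn(500, 500)
B = np.random.randn(500, 500)

start = time.perf_counter()
C_loop = np.zeros((500, 500))
for i in range(500):
    for j in range(500):
        C_loop[i, j] = np.dot(A[i, :], B[:, j])
loop_seconds = time.perf_counter() - start

start = time.perf_counter()
C_vec = A @ B
vectorized_seconds = time.perf_counter() - start

print(f"Loop: {loop_seconds:.2f}s, vectorized: {vectorized_seconds:.4f}s, "
      f"results match: {np.allclose(C_loop, C_vec)}")
```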
Understanding these concepts enables you to:

- Implement ML algorithms from scratch
- Debug and optimize existing implementations
- Understand the mathematical foundations of modern AI systems
- Design new algorithms and approaches