Linear Algebra for Machine Learning
Linear algebra is the backbone of machine learning. Most ML algorithms can be expressed as operations on vectors and matrices, making linear algebra essential for understanding and implementing these algorithms.
Vectors in Machine Learning
Vector Representation of Data
In ML, data points are typically represented as vectors:
import numpy as np
import matplotlib.pyplot as plt
# Example: House price prediction
# Features: [square_feet, bedrooms, bathrooms, age]
= np.array([2000, 3, 2, 5]) # House 1
house1 = np.array([1500, 2, 1, 10]) # House 2
house2 = np.array([2500, 4, 3, 2]) # House 3
house3
# Dataset as matrix (each row is a data point)
= np.array([house1, house2, house3])
X print("Dataset shape:", X.shape)
print("Dataset:\n", X)
Vector Operations in ML
Dot Product and Similarity
def cosine_similarity(v1, v2):
"""Compute cosine similarity between two vectors"""
= np.dot(v1, v2)
dot_product = np.linalg.norm(v1) * np.linalg.norm(v2)
norms return dot_product / norms
# Example: Document similarity
= np.array([1, 2, 0, 1]) # Word frequencies
doc1 = np.array([2, 1, 1, 0]) # Word frequencies
doc2 = np.array([0, 0, 1, 2]) # Word frequencies
doc3
= cosine_similarity(doc1, doc2)
similarity_12 = cosine_similarity(doc1, doc3)
similarity_13
print(f"Similarity between doc1 and doc2: {similarity_12:.3f}")
print(f"Similarity between doc1 and doc3: {similarity_13:.3f}")
Distance Metrics
def euclidean_distance(v1, v2):
"""Euclidean distance between vectors"""
return np.linalg.norm(v1 - v2)
def manhattan_distance(v1, v2):
"""Manhattan distance between vectors"""
return np.sum(np.abs(v1 - v2))
# Example: K-Nearest Neighbors
def knn_classify(X_train, y_train, x_test, k=3):
"""Simple KNN classifier"""
= [euclidean_distance(x_test, x_train) for x_train in X_train]
distances = np.argsort(distances)[:k]
k_indices = y_train[k_indices]
k_labels return np.bincount(k_labels).argmax()
# Sample data
= np.array([[1, 2], [2, 3], [3, 1], [6, 5], [7, 7], [8, 6]])
X_train = np.array([0, 0, 0, 1, 1, 1]) # Class labels
y_train = np.array([4, 4])
x_test
= knn_classify(X_train, y_train, x_test, k=3)
predicted_class print(f"Predicted class for {x_test}: {predicted_class}")
# Visualize
=(8, 6))
plt.figure(figsize= ['red', 'blue']
colors for i in range(2):
= y_train == i
mask 0], X_train[mask, 1],
plt.scatter(X_train[mask, =colors[i], label=f'Class {i}', s=100)
c
0], x_test[1], c='green', marker='x', s=200, label='Test point')
plt.scatter(x_test['Feature 1')
plt.xlabel('Feature 2')
plt.ylabel(
plt.legend()'K-Nearest Neighbors Classification')
plt.title(True)
plt.grid( plt.show()
Matrices in Machine Learning
Data Representation
# Dataset matrix: rows = samples, columns = features
= 1000, 4
n_samples, n_features = np.random.randn(n_samples, n_features)
X = np.random.randint(0, 2, n_samples)
y
print(f"Data matrix shape: {X.shape}")
print(f"Labels shape: {y.shape}")
# Feature statistics
print(f"Feature means: {np.mean(X, axis=0)}")
print(f"Feature std: {np.std(X, axis=0)}")
Matrix Operations in Linear Regression
class LinearRegression:
def __init__(self):
self.weights = None
self.bias = None
def fit(self, X, y):
"""Fit linear regression using normal equation"""
# Add bias term (intercept)
= np.column_stack([np.ones(X.shape[0]), X])
X_with_bias
# Normal equation: θ = (X^T X)^(-1) X^T y
= X_with_bias.T @ X_with_bias
XtX = X_with_bias.T @ y
Xty = np.linalg.solve(XtX, Xty)
theta
self.bias = theta[0]
self.weights = theta[1:]
def predict(self, X):
"""Make predictions"""
return X @ self.weights + self.bias
def score(self, X, y):
"""R-squared score"""
= self.predict(X)
y_pred = np.sum((y - y_pred) ** 2)
ss_res = np.sum((y - np.mean(y)) ** 2)
ss_tot return 1 - (ss_res / ss_tot)
# Generate sample data
42)
np.random.seed(= np.random.randn(100, 2)
X = np.array([3, -2])
true_weights = 1
true_bias = X @ true_weights + true_bias + 0.1 * np.random.randn(100)
y
# Fit model
= LinearRegression()
model
model.fit(X, y)
print(f"True weights: {true_weights}")
print(f"Estimated weights: {model.weights}")
print(f"True bias: {true_bias}")
print(f"Estimated bias: {model.bias:.3f}")
print(f"R-squared: {model.score(X, y):.3f}")
Matrix Factorization
Principal Component Analysis (PCA)
class PCA:
def __init__(self, n_components):
self.n_components = n_components
self.components = None
self.mean = None
self.explained_variance_ratio = None
def fit(self, X):
"""Fit PCA using eigendecomposition"""
# Center the data
self.mean = np.mean(X, axis=0)
= X - self.mean
X_centered
# Compute covariance matrix
= np.cov(X_centered.T)
cov_matrix
# Eigendecomposition
= np.linalg.eigh(cov_matrix)
eigenvalues, eigenvectors
# Sort by eigenvalues (descending)
= np.argsort(eigenvalues)[::-1]
idx = eigenvalues[idx]
eigenvalues = eigenvectors[:, idx]
eigenvectors
# Select top components
self.components = eigenvectors[:, :self.n_components].T
self.explained_variance_ratio = eigenvalues[:self.n_components] / np.sum(eigenvalues)
def transform(self, X):
"""Transform data to lower dimension"""
= X - self.mean
X_centered return X_centered @ self.components.T
def fit_transform(self, X):
"""Fit and transform in one step"""
self.fit(X)
return self.transform(X)
# Generate 2D data with correlation
42)
np.random.seed(= [0, 0]
mean = [[1, 0.8], [0.8, 1]]
cov = np.random.multivariate_normal(mean, cov, 300)
X
# Apply PCA
= PCA(n_components=2)
pca = pca.fit_transform(X)
X_pca
# Visualize
= plt.subplots(1, 2, figsize=(12, 5))
fig, (ax1, ax2)
# Original data
0], X[:, 1], alpha=0.6)
ax1.scatter(X[:, 'Original Data')
ax1.set_title('Feature 1')
ax1.set_xlabel('Feature 2')
ax1.set_ylabel(True)
ax1.grid('equal')
ax1.axis(
# PCA transformed data
0], X_pca[:, 1], alpha=0.6)
ax2.scatter(X_pca[:, 'PCA Transformed Data')
ax2.set_title('First Principal Component')
ax2.set_xlabel('Second Principal Component')
ax2.set_ylabel(True)
ax2.grid('equal')
ax2.axis(
plt.tight_layout()
plt.show()
print(f"Explained variance ratio: {pca.explained_variance_ratio}")
print(f"Total variance explained: {np.sum(pca.explained_variance_ratio):.3f}")
Eigenvalues and Eigenvectors
Understanding Eigendecomposition
def visualize_eigenvectors(A):
"""Visualize how matrix A transforms eigenvectors"""
# Compute eigenvalues and eigenvectors
= np.linalg.eig(A)
eigenvalues, eigenvectors
# Create unit circle
= np.linspace(0, 2*np.pi, 100)
theta = np.array([np.cos(theta), np.sin(theta)])
unit_circle
# Transform unit circle
= A @ unit_circle
transformed_circle
=(12, 5))
plt.figure(figsize
# Original space
1, 2, 1)
plt.subplot(0], unit_circle[1], 'b-', label='Unit circle')
plt.plot(unit_circle[
# Plot eigenvectors
for i, (val, vec) in enumerate(zip(eigenvalues, eigenvectors.T)):
0, 0, vec[0], vec[1], head_width=0.1, head_length=0.1,
plt.arrow(=f'C{i}', ec=f'C{i}', label=f'Eigenvector {i+1} (λ={val:.2f})')
fc
'equal')
plt.axis(True)
plt.grid(
plt.legend()'Original Space')
plt.title(
# Transformed space
1, 2, 2)
plt.subplot(0], transformed_circle[1], 'r-', label='Transformed circle')
plt.plot(transformed_circle[
# Plot transformed eigenvectors
for i, (val, vec) in enumerate(zip(eigenvalues, eigenvectors.T)):
= A @ vec
transformed_vec 0, 0, transformed_vec[0], transformed_vec[1],
plt.arrow(=0.1, head_length=0.1,
head_width=f'C{i}', ec=f'C{i}', label=f'Transformed eigenvector {i+1}')
fc
'equal')
plt.axis(True)
plt.grid(
plt.legend()'Transformed Space')
plt.title(
plt.tight_layout()
plt.show()
# Example matrix
= np.array([[2, 1], [1, 2]])
A visualize_eigenvectors(A)
Spectral Clustering
def spectral_clustering(X, n_clusters=2, sigma=1.0):
"""Simple spectral clustering implementation"""
= X.shape[0]
n_samples
# Compute similarity matrix (RBF kernel)
= np.sum(X**2, axis=1, keepdims=True) + np.sum(X**2, axis=1) - 2 * X @ X.T
distances = np.exp(-distances / (2 * sigma**2))
similarity
# Compute degree matrix
= np.diag(np.sum(similarity, axis=1))
degree
# Compute normalized Laplacian
= np.diag(1.0 / np.sqrt(np.sum(similarity, axis=1)))
D_inv_sqrt = np.eye(n_samples) - D_inv_sqrt @ similarity @ D_inv_sqrt
L_norm
# Eigendecomposition
= np.linalg.eigh(L_norm)
eigenvalues, eigenvectors
# Use smallest eigenvectors for clustering
= eigenvectors[:, :n_clusters]
embedding
# K-means on embedding
from sklearn.cluster import KMeans
= KMeans(n_clusters=n_clusters, random_state=42)
kmeans = kmeans.fit_predict(embedding)
labels
return labels, embedding
# Generate two moons dataset
from sklearn.datasets import make_moons
= make_moons(n_samples=200, noise=0.1, random_state=42)
X, y_true
# Apply spectral clustering
= spectral_clustering(X, n_clusters=2, sigma=0.3)
labels_spectral, embedding
# Visualize results
= plt.subplots(1, 3, figsize=(15, 5))
fig, axes
# Original data
0].scatter(X[:, 0], X[:, 1], c=y_true, cmap='viridis')
axes[0].set_title('True Clusters')
axes[0].grid(True)
axes[
# Spectral embedding
1].scatter(embedding[:, 0], embedding[:, 1], c=y_true, cmap='viridis')
axes[1].set_title('Spectral Embedding')
axes[1].grid(True)
axes[
# Spectral clustering result
2].scatter(X[:, 0], X[:, 1], c=labels_spectral, cmap='viridis')
axes[2].set_title('Spectral Clustering Result')
axes[2].grid(True)
axes[
plt.tight_layout() plt.show()
Matrix Calculus for Neural Networks
Gradient Computation
class SimpleNeuralNetwork:
def __init__(self, input_size, hidden_size, output_size):
# Initialize weights
self.W1 = np.random.randn(input_size, hidden_size) * 0.1
self.b1 = np.zeros((1, hidden_size))
self.W2 = np.random.randn(hidden_size, output_size) * 0.1
self.b2 = np.zeros((1, output_size))
def sigmoid(self, x):
return 1 / (1 + np.exp(-np.clip(x, -250, 250)))
def sigmoid_derivative(self, x):
= self.sigmoid(x)
s return s * (1 - s)
def forward(self, X):
"""Forward pass"""
self.z1 = X @ self.W1 + self.b1
self.a1 = self.sigmoid(self.z1)
self.z2 = self.a1 @ self.W2 + self.b2
self.a2 = self.sigmoid(self.z2)
return self.a2
def backward(self, X, y, output):
"""Backward pass using matrix calculus"""
= X.shape[0]
m
# Output layer gradients
= output - y
dz2 = (1/m) * self.a1.T @ dz2
dW2 = (1/m) * np.sum(dz2, axis=0, keepdims=True)
db2
# Hidden layer gradients
= dz2 @ self.W2.T
da1 = da1 * self.sigmoid_derivative(self.z1)
dz1 = (1/m) * X.T @ dz1
dW1 = (1/m) * np.sum(dz1, axis=0, keepdims=True)
db1
return dW1, db1, dW2, db2
def train(self, X, y, epochs=1000, learning_rate=0.1):
"""Train the network"""
= []
losses
for epoch in range(epochs):
# Forward pass
= self.forward(X)
output
# Compute loss
= np.mean((output - y)**2)
loss
losses.append(loss)
# Backward pass
= self.backward(X, y, output)
dW1, db1, dW2, db2
# Update weights
self.W1 -= learning_rate * dW1
self.b1 -= learning_rate * db1
self.W2 -= learning_rate * dW2
self.b2 -= learning_rate * db2
if epoch % 100 == 0:
print(f"Epoch {epoch}, Loss: {loss:.4f}")
return losses
# Generate XOR dataset
= np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
X = np.array([[0], [1], [1], [0]])
y
# Create and train network
= SimpleNeuralNetwork(2, 4, 1)
nn = nn.train(X, y, epochs=5000, learning_rate=1.0)
losses
# Test the network
= nn.forward(X)
predictions print("\nFinal predictions:")
for i in range(len(X)):
print(f"Input: {X[i]}, Target: {y[i][0]}, Prediction: {predictions[i][0]:.3f}")
# Plot training loss
=(10, 6))
plt.figure(figsize
plt.plot(losses)'Training Loss')
plt.title('Epoch')
plt.xlabel('Mean Squared Error')
plt.ylabel(True)
plt.grid( plt.show()
Singular Value Decomposition (SVD)
Recommender Systems
def matrix_factorization_svd(R, k=10):
"""Matrix factorization using SVD for recommender systems"""
# R is user-item rating matrix
# Fill missing values with mean
= R.copy()
R_filled = ~np.isnan(R)
mask ~mask] = np.nanmean(R)
R_filled[
# Center the data
= np.nanmean(R, axis=1, keepdims=True)
user_means = R_filled - user_means
R_centered
# SVD decomposition
= np.linalg.svd(R_centered, full_matrices=False)
U, s, Vt
# Keep only top k components
= U[:, :k]
U_k = s[:k]
s_k = Vt[:k, :]
Vt_k
# Reconstruct matrix
= U_k @ np.diag(s_k) @ Vt_k + user_means
R_reconstructed
return R_reconstructed, U_k, s_k, Vt_k
# Create sample user-item rating matrix
42)
np.random.seed(= 100, 50
n_users, n_items = np.random.rand(n_users, n_items) * 5
R
# Introduce missing values (80% sparsity)
= np.random.rand(n_users, n_items) < 0.8
mask = np.nan
R[mask]
print(f"Original matrix shape: {R.shape}")
print(f"Sparsity: {np.sum(mask) / (n_users * n_items):.2%}")
# Apply matrix factorization
= matrix_factorization_svd(R, k=10)
R_pred, U, s, Vt
# Compute RMSE on observed entries
= ~np.isnan(R)
observed_mask = np.sqrt(np.mean((R[observed_mask] - R_pred[observed_mask])**2))
rmse print(f"RMSE on observed entries: {rmse:.3f}")
# Visualize singular values
=(10, 6))
plt.figure(figsize'bo-')
plt.plot(s, 'Singular Values')
plt.title('Component')
plt.xlabel('Singular Value')
plt.ylabel(True)
plt.grid( plt.show()
Practical Applications
Image Compression using SVD
def compress_image_svd(image, k):
"""Compress image using SVD"""
if len(image.shape) == 3:
# Color image - compress each channel
= np.zeros_like(image)
compressed for i in range(3):
= np.linalg.svd(image[:, :, i], full_matrices=False)
U, s, Vt = U[:, :k] @ np.diag(s[:k]) @ Vt[:k, :]
compressed[:, :, i] else:
# Grayscale image
= np.linalg.svd(image, full_matrices=False)
U, s, Vt = U[:, :k] @ np.diag(s[:k]) @ Vt[:k, :]
compressed
return np.clip(compressed, 0, 255).astype(np.uint8)
# Create a simple synthetic image
def create_test_image():
= np.linspace(0, 4*np.pi, 100)
x = np.linspace(0, 4*np.pi, 100)
y = np.meshgrid(x, y)
X, Y = 128 + 127 * np.sin(X) * np.cos(Y)
image return image.astype(np.uint8)
# Test image compression
= create_test_image()
original_image = [5, 10, 20, 50]
compression_ratios
= plt.subplots(1, len(compression_ratios) + 1, figsize=(15, 3))
fig, axes
# Original image
0].imshow(original_image, cmap='gray')
axes[0].set_title('Original')
axes[0].axis('off')
axes[
# Compressed images
for i, k in enumerate(compression_ratios):
= compress_image_svd(original_image, k)
compressed +1].imshow(compressed, cmap='gray')
axes[i
# Calculate compression ratio
= original_image.size
original_size = k * (original_image.shape[0] + original_image.shape[1] + 1)
compressed_size = compressed_size / original_size
ratio
+1].set_title(f'k={k}\n({ratio:.1%} of original)')
axes[i+1].axis('off')
axes[i
plt.tight_layout() plt.show()
Summary
Linear algebra is fundamental to machine learning because:
- Data Representation: Data is naturally represented as vectors and matrices
- Efficient Computation: Matrix operations enable vectorized computations
- Dimensionality Reduction: Techniques like PCA use eigendecomposition
- Neural Networks: Forward and backward passes are matrix multiplications
- Optimization: Gradient-based methods rely on matrix calculus
- Similarity and Distance: Vector operations measure data relationships
Understanding these concepts enables you to: - Implement ML algorithms from scratch - Debug and optimize existing implementations - Understand the mathematical foundations of modern AI systems - Design new algorithms and approaches