Gradient-Based Optimization Methods
Gradient-based methods are the backbone of modern machine learning and optimization. They use derivative information to iteratively move toward a (local) minimum, which makes them essential for training neural networks and for solving large-scale optimization problems.
Gradient Descent Fundamentals
Basic Gradient Descent
The gradient descent algorithm updates parameters in the direction of steepest descent:
x_{k+1} = x_k - α ∇f(x_k)
where:
- α is the learning rate (step size)
- ∇f(x_k) is the gradient at point x_k
For example, with f(x, y) = x² + 2y² (used in the code below), starting from x_0 = (3, 2) with α = 0.1, the first update is x_1 = (3, 2) - 0.1·(6, 8) = (2.4, 1.2).
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
def gradient_descent_basic(f, grad_f, x0, learning_rate=0.01, max_iterations=1000, tolerance=1e-6):
"""Basic gradient descent implementation"""
x = x0.copy()
history = [x.copy()]
for i in range(max_iterations):
gradient = grad_f(x)
x_new = x - learning_rate * gradient
# Check convergence
if np.linalg.norm(x_new - x) < tolerance:
print(f"Converged after {i+1} iterations")
break
x = x_new
history.append(x.copy())
return x, np.array(history)
# Example: Minimize f(x,y) = x² + 2y²
def quadratic_function(x):
return x[0]**2 + 2*x[1]**2
def quadratic_gradient(x):
return np.array([2*x[0], 4*x[1]])
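# (Added sketch) Before optimizing, it can be worth confirming that the analytic
# gradient matches a central finite-difference estimate; the step size h and the
# test point below are arbitrary illustrative choices.
def check_gradient(f, grad_f, x, h=1e-6):
    """Return the norm of the difference between analytic and numerical gradients."""
    numeric = np.zeros_like(x)
    for j in range(len(x)):
        e = np.zeros_like(x)
        e[j] = h
        numeric[j] = (f(x + e) - f(x - e)) / (2 * h)
    return np.linalg.norm(numeric - grad_f(x))

err = check_gradient(quadratic_function, quadratic_gradient, np.array([1.0, -1.0]))
print(f"Gradient check error: {err:.2e}")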
# Optimize
x0 = np.array([3.0, 2.0])
optimal_x, history = gradient_descent_basic(quadratic_function, quadratic_gradient, x0, learning_rate=0.1)
print(f"Starting point: {x0}")
print(f"Optimal point: {optimal_x}")
print(f"Function value: {quadratic_function(optimal_x):.6f}")
# Visualize optimization path
x = np.linspace(-4, 4, 100)
y = np.linspace(-3, 3, 100)
X, Y = np.meshgrid(x, y)
Z = X**2 + 2*Y**2
plt.figure(figsize=(12, 5))
# 2D contour plot
plt.subplot(1, 2, 1)
contours = plt.contour(X, Y, Z, levels=20)
plt.plot(history[:, 0], history[:, 1], 'ro-', markersize=4, linewidth=2, label='Optimization path')
plt.plot(optimal_x[0], optimal_x[1], 'g*', markersize=15, label='Optimum')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Gradient Descent Path')
plt.legend()
plt.grid(True, alpha=0.3)
# 3D surface plot
ax = plt.subplot(1, 2, 2, projection='3d')
ax.plot_surface(X, Y, Z, alpha=0.6, cmap='viridis')
ax.plot(history[:, 0], history[:, 1], [quadratic_function(h) for h in history],
'ro-', markersize=4, linewidth=2, label='Optimization path')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('f(x,y)')
ax.set_title('3D Optimization Path')
plt.tight_layout()
plt.show()
Learning Rate Analysis
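For this quadratic there is a sharp stability threshold: gradient descent converges only if the step size satisfies α < 2/L, where L is the largest eigenvalue of the Hessian (here the Hessian is diag(2, 4), so L = 4 and the threshold is α = 0.5). A small added sketch checks that bound; the sweep below then shows what happens on either side of it (variable names are illustrative):

H = np.diag([2.0, 4.0])              # constant Hessian of f(x, y) = x² + 2y²
L_max = np.linalg.eigvalsh(H).max()  # largest curvature (Lipschitz constant of the gradient)
print(f"Stable learning rates for this quadratic: alpha < 2/L = {2 / L_max}")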
def analyze_learning_rates():
"""Analyze the effect of different learning rates"""
learning_rates = [0.01, 0.1, 0.5, 0.9]
x0 = np.array([3.0, 2.0])
plt.figure(figsize=(15, 10))
for i, lr in enumerate(learning_rates):
optimal_x, history = gradient_descent_basic(
quadratic_function, quadratic_gradient, x0,
learning_rate=lr, max_iterations=50
)
plt.subplot(2, 2, i+1)
# Plot contours
x = np.linspace(-4, 4, 100)
y = np.linspace(-3, 3, 100)
X, Y = np.meshgrid(x, y)
Z = X**2 + 2*Y**2
plt.contour(X, Y, Z, levels=15, alpha=0.6)
# Plot optimization path
plt.plot(history[:, 0], history[:, 1], 'ro-', markersize=3, linewidth=1.5)
plt.plot(0, 0, 'g*', markersize=15, label='True optimum')
plt.xlabel('x')
plt.ylabel('y')
plt.title(f'Learning Rate = {lr}')
plt.grid(True, alpha=0.3)
plt.legend()
# Print convergence info
final_error = np.linalg.norm(optimal_x)
print(f"LR = {lr}: Final error = {final_error:.6f}, Iterations = {len(history)}")
plt.tight_layout()
plt.show()
analyze_learning_rates()
Advanced Gradient Methods
Momentum
Momentum accumulates an exponentially weighted average of past gradients, which accelerates convergence along consistent directions and damps oscillations:
v_{k+1} = β v_k + α ∇f(x_k)
x_{k+1} = x_k - v_{k+1}
def gradient_descent_momentum(f, grad_f, x0, learning_rate=0.01, momentum=0.9,
max_iterations=1000, tolerance=1e-6):
"""Gradient descent with momentum"""
x = x0.copy()
velocity = np.zeros_like(x)
history = [x.copy()]
for i in range(max_iterations):
gradient = grad_f(x)
velocity = momentum * velocity + learning_rate * gradient
x_new = x - velocity
if np.linalg.norm(x_new - x) < tolerance:
print(f"Momentum GD converged after {i+1} iterations")
break
x = x_new
history.append(x.copy())
return x, np.array(history)
# Compare standard GD vs momentum
def compare_momentum():
"""Compare gradient descent with and without momentum"""
# Ill-conditioned quadratic: f(x,y) = 10x² + y²
def ill_conditioned_f(x):
return 10*x[0]**2 + x[1]**2
def ill_conditioned_grad(x):
return np.array([20*x[0], 2*x[1]])
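    # (Added note) The Hessian of this objective is diag(20, 2), giving a condition
    # number of 10; the resulting elongated contours are what make plain gradient
    # descent zig-zag, while momentum smooths progress along the shallow direction.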
x0 = np.array([2.0, 2.0])
# Standard gradient descent
opt_std, hist_std = gradient_descent_basic(
ill_conditioned_f, ill_conditioned_grad, x0,
learning_rate=0.05, max_iterations=100
)
# Gradient descent with momentum
opt_mom, hist_mom = gradient_descent_momentum(
ill_conditioned_f, ill_conditioned_grad, x0,
learning_rate=0.05, momentum=0.9, max_iterations=100
)
# Visualize comparison
plt.figure(figsize=(15, 5))
# Create contour plot
x = np.linspace(-3, 3, 100)
y = np.linspace(-3, 3, 100)
X, Y = np.meshgrid(x, y)
Z = 10*X**2 + Y**2
# Standard GD
plt.subplot(1, 3, 1)
plt.contour(X, Y, Z, levels=20)
plt.plot(hist_std[:, 0], hist_std[:, 1], 'ro-', markersize=3, label='Standard GD')
plt.plot(0, 0, 'g*', markersize=15, label='Optimum')
plt.title('Standard Gradient Descent')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.grid(True, alpha=0.3)
# Momentum GD
plt.subplot(1, 3, 2)
plt.contour(X, Y, Z, levels=20)
plt.plot(hist_mom[:, 0], hist_mom[:, 1], 'bo-', markersize=3, label='Momentum GD')
plt.plot(0, 0, 'g*', markersize=15, label='Optimum')
plt.title('Gradient Descent with Momentum')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.grid(True, alpha=0.3)
# Convergence comparison
plt.subplot(1, 3, 3)
std_errors = [np.linalg.norm(h) for h in hist_std]
mom_errors = [np.linalg.norm(h) for h in hist_mom]
plt.semilogy(std_errors, 'r-', label='Standard GD', linewidth=2)
plt.semilogy(mom_errors, 'b-', label='Momentum GD', linewidth=2)
plt.xlabel('Iteration')
plt.ylabel('Distance to Optimum (log scale)')
plt.title('Convergence Comparison')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print(f"Standard GD: {len(hist_std)} iterations, final error: {np.linalg.norm(opt_std):.6f}")
print(f"Momentum GD: {len(hist_mom)} iterations, final error: {np.linalg.norm(opt_mom):.6f}")
compare_momentum()
Adaptive Learning Rates
AdaGrad
AdaGrad adapts a per-coordinate learning rate by accumulating the history of squared gradients:
G_{k+1} = G_k + ∇f(x_k)²    (element-wise)
x_{k+1} = x_k - α ∇f(x_k) / (√G_{k+1} + ε)
def adagrad(f, grad_f, x0, learning_rate=0.1, epsilon=1e-8, max_iterations=1000, tolerance=1e-6):
"""AdaGrad optimizer"""
x = x0.copy()
G = np.zeros_like(x) # Accumulated squared gradients
history = [x.copy()]
for i in range(max_iterations):
gradient = grad_f(x)
G += gradient**2
# Adaptive learning rate
adapted_lr = learning_rate / (np.sqrt(G) + epsilon)
x_new = x - adapted_lr * gradient
if np.linalg.norm(x_new - x) < tolerance:
print(f"AdaGrad converged after {i+1} iterations")
break
x = x_new
history.append(x.copy())
    return x, np.array(history)
Adam (Adaptive Moment Estimation)
Adam combines momentum with adaptive learning rates by keeping exponentially decaying averages of past gradients and squared gradients:
m_{k+1} = β₁ m_k + (1 - β₁) ∇f(x_k)
v_{k+1} = β₂ v_k + (1 - β₂) ∇f(x_k)²
x_{k+1} = x_k - α m̂_{k+1} / (√v̂_{k+1} + ε)
where m̂ and v̂ are the bias-corrected estimates computed in the code below.
def adam(f, grad_f, x0, learning_rate=0.001, beta1=0.9, beta2=0.999,
epsilon=1e-8, max_iterations=1000, tolerance=1e-6):
"""Adam optimizer"""
x = x0.copy()
m = np.zeros_like(x) # First moment estimate
v = np.zeros_like(x) # Second moment estimate
history = [x.copy()]
for i in range(max_iterations):
gradient = grad_f(x)
# Update biased first moment estimate
m = beta1 * m + (1 - beta1) * gradient
# Update biased second raw moment estimate
v = beta2 * v + (1 - beta2) * gradient**2
# Compute bias-corrected first moment estimate
m_hat = m / (1 - beta1**(i + 1))
# Compute bias-corrected second raw moment estimate
v_hat = v / (1 - beta2**(i + 1))
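        # (Added note) Bias correction matters most in early iterations: after the first
        # step m equals (1 - beta1) * gradient, so dividing by (1 - beta1**1) recovers the
        # raw gradient; without it the initial updates would be shrunk by roughly 1/(1 - beta1).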
# Update parameters
x_new = x - learning_rate * m_hat / (np.sqrt(v_hat) + epsilon)
if np.linalg.norm(x_new - x) < tolerance:
print(f"Adam converged after {i+1} iterations")
break
x = x_new
history.append(x.copy())
return x, np.array(history)
def compare_optimizers():
"""Compare different optimization algorithms"""
# Rosenbrock function: f(x,y) = (a-x)² + b(y-x²)²
def rosenbrock(x, a=1, b=100):
return (a - x[0])**2 + b * (x[1] - x[0]**2)**2
def rosenbrock_grad(x, a=1, b=100):
dx = -2*(a - x[0]) - 4*b*x[0]*(x[1] - x[0]**2)
dy = 2*b*(x[1] - x[0]**2)
return np.array([dx, dy])
x0 = np.array([-1.0, 1.0])
# Run different optimizers
optimizers = {
'GD': lambda: gradient_descent_basic(rosenbrock, rosenbrock_grad, x0, 0.001, 2000),
'Momentum': lambda: gradient_descent_momentum(rosenbrock, rosenbrock_grad, x0, 0.001, 0.9, 2000),
'AdaGrad': lambda: adagrad(rosenbrock, rosenbrock_grad, x0, 0.1, max_iterations=2000),
'Adam': lambda: adam(rosenbrock, rosenbrock_grad, x0, 0.01, max_iterations=2000)
}
results = {}
for name, optimizer in optimizers.items():
opt_x, history = optimizer()
results[name] = {
'optimum': opt_x,
'history': history,
'final_value': rosenbrock(opt_x)
}
# Visualize results
plt.figure(figsize=(15, 10))
# Create Rosenbrock function contour
x = np.linspace(-2, 2, 100)
y = np.linspace(-1, 3, 100)
X, Y = np.meshgrid(x, y)
Z = (1 - X)**2 + 100 * (Y - X**2)**2
colors = ['red', 'blue', 'green', 'orange']
for i, (name, result) in enumerate(results.items()):
plt.subplot(2, 2, i+1)
plt.contour(X, Y, Z, levels=np.logspace(-1, 3, 20))
history = result['history']
plt.plot(history[:, 0], history[:, 1], 'o-', color=colors[i],
markersize=2, linewidth=1, alpha=0.7, label=name)
plt.plot(1, 1, 'r*', markersize=15, label='Global minimum')
plt.xlabel('x')
plt.ylabel('y')
plt.title(f'{name} Optimizer')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
# Print results summary
print("Optimizer Comparison Results:")
print("=" * 50)
print(f"{'Optimizer':>10} {'Iterations':>12} {'Final Value':>15} {'Distance to Opt':>15}")
print("-" * 50)
for name, result in results.items():
iterations = len(result['history'])
final_value = result['final_value']
distance = np.linalg.norm(result['optimum'] - np.array([1, 1]))
print(f"{name:>10} {iterations:>12} {final_value:>15.6f} {distance:>15.6f}")
compare_optimizers()
Stochastic Gradient Descent
Mini-batch SGD
For large datasets, computing the exact gradient over all samples at every step is too expensive, so mini-batch stochastic gradient descent estimates it from small random subsets of the data:
def stochastic_gradient_descent(X, y, loss_fn, grad_fn, theta0,
learning_rate=0.01, batch_size=32,
epochs=100, shuffle=True):
"""Mini-batch stochastic gradient descent"""
theta = theta0.copy()
n_samples = X.shape[0]
history = {'loss': [], 'theta': []}
for epoch in range(epochs):
# Shuffle data
if shuffle:
indices = np.random.permutation(n_samples)
X_shuffled = X[indices]
y_shuffled = y[indices]
else:
X_shuffled, y_shuffled = X, y
epoch_loss = 0
n_batches = 0
# Process mini-batches
for i in range(0, n_samples, batch_size):
batch_X = X_shuffled[i:i+batch_size]
batch_y = y_shuffled[i:i+batch_size]
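            # (Added note) The final slice may contain fewer than batch_size samples;
            # that is fine because grad_fn and loss_fn average over len(batch_y).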
# Compute gradient on mini-batch
gradient = grad_fn(theta, batch_X, batch_y)
# Update parameters
theta = theta - learning_rate * gradient
# Track loss
batch_loss = loss_fn(theta, batch_X, batch_y)
epoch_loss += batch_loss
n_batches += 1
# Record history
avg_loss = epoch_loss / n_batches
history['loss'].append(avg_loss)
history['theta'].append(theta.copy())
if epoch % 10 == 0:
print(f"Epoch {epoch}: Loss = {avg_loss:.6f}")
return theta, history
# Example: Linear regression with SGD
def linear_regression_sgd_example():
"""Linear regression using SGD"""
# Generate synthetic data
np.random.seed(42)
n_samples, n_features = 1000, 5
X = np.random.randn(n_samples, n_features)
true_theta = np.random.randn(n_features)
y = X @ true_theta + 0.1 * np.random.randn(n_samples)
# Add bias term
X = np.column_stack([np.ones(n_samples), X])
true_theta = np.concatenate([[0], true_theta])
# Define loss and gradient functions
def mse_loss(theta, X, y):
predictions = X @ theta
return np.mean((predictions - y)**2)
def mse_gradient(theta, X, y):
predictions = X @ theta
return 2 * X.T @ (predictions - y) / len(y)
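    # (Added note) This matches the derivative of the loss above:
    # d/dθ mean((Xθ - y)²) = (2/n) Xᵀ(Xθ - y).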
# Initialize parameters
theta0 = np.random.randn(X.shape[1]) * 0.1
# Run SGD
theta_sgd, history = stochastic_gradient_descent(
X, y, mse_loss, mse_gradient, theta0,
learning_rate=0.01, batch_size=32, epochs=100
)
# Compare with analytical solution
    theta_analytical = np.linalg.solve(X.T @ X, X.T @ y)  # normal equations; np.linalg.lstsq(X, y) is more robust when X is ill-conditioned
print(f"\nResults Comparison:")
print(f"True parameters: {true_theta}")
print(f"SGD parameters: {theta_sgd}")
print(f"Analytical solution: {theta_analytical}")
print(f"SGD error: {np.linalg.norm(theta_sgd - true_theta):.6f}")
print(f"Analytical error: {np.linalg.norm(theta_analytical - true_theta):.6f}")
# Plot convergence
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(history['loss'])
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('SGD Convergence')
plt.grid(True, alpha=0.3)
plt.subplot(1, 2, 2)
theta_history = np.array(history['theta'])
for i in range(len(true_theta)):
plt.plot(theta_history[:, i], label=f'θ_{i}')
plt.axhline(true_theta[i], color=f'C{i}', linestyle='--', alpha=0.7)
plt.xlabel('Epoch')
plt.ylabel('Parameter Value')
plt.title('Parameter Evolution')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
linear_regression_sgd_example()
Second-Order Methods
Newton’s Method
Newton’s method uses second-order information (the Hessian matrix); because it rescales the gradient by the local curvature, a single Newton step lands exactly on the minimizer of a quadratic:
x_{k+1} = x_k - H^{-1}(x_k) ∇f(x_k)
def newtons_method(f, grad_f, hessian_f, x0, max_iterations=100, tolerance=1e-6):
"""Newton's method for optimization"""
x = x0.copy()
history = [x.copy()]
for i in range(max_iterations):
gradient = grad_f(x)
hessian = hessian_f(x)
# Check if Hessian is positive definite
eigenvals = np.linalg.eigvals(hessian)
if np.any(eigenvals <= 0):
print(f"Warning: Hessian not positive definite at iteration {i}")
# Newton step
try:
newton_step = np.linalg.solve(hessian, gradient)
x_new = x - newton_step
except np.linalg.LinAlgError:
print(f"Singular Hessian at iteration {i}")
break
if np.linalg.norm(x_new - x) < tolerance:
print(f"Newton's method converged after {i+1} iterations")
break
x = x_new
history.append(x.copy())
return x, np.array(history)
# Example: Compare Newton's method with gradient descent
def compare_newton_gd():
"""Compare Newton's method with gradient descent"""
# Quadratic function with Hessian
def quad_hessian(x):
return np.array([[2, 0], [0, 4]])
x0 = np.array([3.0, 2.0])
# Newton's method
opt_newton, hist_newton = newtons_method(
quadratic_function, quadratic_gradient, quad_hessian, x0
)
# Gradient descent
opt_gd, hist_gd = gradient_descent_basic(
quadratic_function, quadratic_gradient, x0, learning_rate=0.1
)
# Visualize comparison
plt.figure(figsize=(15, 5))
# Create contour plot
x = np.linspace(-4, 4, 100)
y = np.linspace(-3, 3, 100)
X, Y = np.meshgrid(x, y)
Z = X**2 + 2*Y**2
# Newton's method
plt.subplot(1, 3, 1)
plt.contour(X, Y, Z, levels=20)
plt.plot(hist_newton[:, 0], hist_newton[:, 1], 'ro-', markersize=6, linewidth=2, label="Newton's Method")
plt.plot(0, 0, 'g*', markersize=15, label='Optimum')
plt.title("Newton's Method")
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.grid(True, alpha=0.3)
# Gradient descent
plt.subplot(1, 3, 2)
plt.contour(X, Y, Z, levels=20)
plt.plot(hist_gd[:, 0], hist_gd[:, 1], 'bo-', markersize=3, linewidth=1, label='Gradient Descent')
plt.plot(0, 0, 'g*', markersize=15, label='Optimum')
plt.title('Gradient Descent')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.grid(True, alpha=0.3)
# Convergence comparison
plt.subplot(1, 3, 3)
newton_errors = [np.linalg.norm(h) for h in hist_newton]
gd_errors = [np.linalg.norm(h) for h in hist_gd]
plt.semilogy(newton_errors, 'ro-', label="Newton's Method", linewidth=2)
plt.semilogy(gd_errors, 'bo-', label='Gradient Descent', linewidth=2)
plt.xlabel('Iteration')
plt.ylabel('Distance to Optimum (log scale)')
plt.title('Convergence Comparison')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print(f"Newton's method: {len(hist_newton)} iterations")
print(f"Gradient descent: {len(hist_gd)} iterations")
print(f"Newton's method final error: {np.linalg.norm(opt_newton):.10f}")
print(f"Gradient descent final error: {np.linalg.norm(opt_gd):.10f}")
compare_newton_gd()
Quasi-Newton Methods (BFGS)
BFGS builds an approximation to the inverse Hessian from successive gradient differences, avoiding the cost of forming and inverting the true Hessian:
def bfgs(f, grad_f, x0, max_iterations=100, tolerance=1e-6):
"""BFGS quasi-Newton method"""
x = x0.copy()
n = len(x)
    H = np.eye(n)  # Initial inverse-Hessian approximation
history = [x.copy()]
gradient = grad_f(x)
for i in range(max_iterations):
        # Search direction from the inverse-Hessian approximation: p = -H ∇f(x)
        p = -H @ gradient
# Line search (simplified - use fixed step size)
alpha = 1.0
x_new = x + alpha * p
gradient_new = grad_f(x_new)
# Check convergence
if np.linalg.norm(gradient_new) < tolerance:
print(f"BFGS converged after {i+1} iterations")
break
        # BFGS update of the inverse-Hessian approximation
        s = x_new - x
        y = gradient_new - gradient
        # Skip the update when the curvature condition s·y > 0 is not (numerically) satisfied
        if np.dot(s, y) > 1e-10:
            rho = 1.0 / np.dot(s, y)
            I = np.eye(n)
            H = (I - rho * np.outer(s, y)) @ H @ (I - rho * np.outer(y, s)) + rho * np.outer(s, s)
x = x_new
gradient = gradient_new
history.append(x.copy())
return x, np.array(history)
# Compare BFGS with other methods
def compare_all_methods():
"""Compare all optimization methods"""
x0 = np.array([3.0, 2.0])
methods = {
'Gradient Descent': lambda: gradient_descent_basic(quadratic_function, quadratic_gradient, x0, 0.1),
'Newton': lambda: newtons_method(quadratic_function, quadratic_gradient,
lambda x: np.array([[2, 0], [0, 4]]), x0),
'BFGS': lambda: bfgs(quadratic_function, quadratic_gradient, x0),
'Adam': lambda: adam(quadratic_function, quadratic_gradient, x0, 0.1)
}
plt.figure(figsize=(12, 8))
colors = ['red', 'blue', 'green', 'orange']
# Create contour plot
x = np.linspace(-4, 4, 100)
y = np.linspace(-3, 3, 100)
X, Y = np.meshgrid(x, y)
Z = X**2 + 2*Y**2
plt.contour(X, Y, Z, levels=15, alpha=0.6)
for i, (name, method) in enumerate(methods.items()):
opt_x, history = method()
plt.plot(history[:, 0], history[:, 1], 'o-', color=colors[i],
markersize=4, linewidth=2, label=f'{name} ({len(history)} iter)')
plt.plot(0, 0, 'k*', markersize=15, label='Optimum')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Optimization Methods Comparison')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
compare_all_methods()
Applications in Machine Learning
Logistic Regression
def logistic_regression_optimization():
"""Optimize logistic regression using different methods"""
# Generate binary classification data
np.random.seed(42)
n_samples, n_features = 1000, 2
X = np.random.randn(n_samples, n_features)
true_w = np.array([1.5, -2.0])
true_b = 0.5
# Generate labels
logits = X @ true_w + true_b
probabilities = 1 / (1 + np.exp(-logits))
y = np.random.binomial(1, probabilities)
# Add bias term
X_with_bias = np.column_stack([np.ones(n_samples), X])
true_params = np.array([true_b, true_w[0], true_w[1]])
# Logistic regression loss and gradient
def logistic_loss(params, X, y):
logits = X @ params
        return np.mean(np.logaddexp(0, logits) - y * logits)  # log(1 + e^z) - y·z, computed without overflow
def logistic_gradient(params, X, y):
logits = X @ params
probabilities = 1 / (1 + np.exp(-logits))
return X.T @ (probabilities - y) / len(y)
# Initialize parameters
params0 = np.random.randn(X_with_bias.shape[1]) * 0.1
# Optimize using different methods
methods = {
'SGD': lambda: stochastic_gradient_descent(
X_with_bias, y, logistic_loss, logistic_gradient, params0,
learning_rate=0.1, batch_size=64, epochs=100
),
'Adam': lambda: adam(
lambda p: logistic_loss(p, X_with_bias, y),
lambda p: logistic_gradient(p, X_with_bias, y),
params0, learning_rate=0.01, max_iterations=1000
)
}
results = {}
for name, method in methods.items():
if name == 'SGD':
params_opt, history = method()
loss_history = history['loss']
else:
params_opt, param_history = method()
loss_history = [logistic_loss(p, X_with_bias, y) for p in param_history]
results[name] = {
'params': params_opt,
'loss_history': loss_history,
'final_loss': logistic_loss(params_opt, X_with_bias, y)
}
# Visualize results
plt.figure(figsize=(15, 5))
# Decision boundary visualization
plt.subplot(1, 3, 1)
# Plot data points
plt.scatter(X[y==0, 0], X[y==0, 1], c='red', marker='o', alpha=0.6, label='Class 0')
plt.scatter(X[y==1, 0], X[y==1, 1], c='blue', marker='s', alpha=0.6, label='Class 1')
# Plot true decision boundary
x_boundary = np.linspace(-3, 3, 100)
y_boundary = -(true_params[0] + true_params[1] * x_boundary) / true_params[2]
plt.plot(x_boundary, y_boundary, 'g--', linewidth=2, label='True boundary')
# Plot learned decision boundary (using Adam result)
adam_params = results['Adam']['params']
y_learned = -(adam_params[0] + adam_params[1] * x_boundary) / adam_params[2]
plt.plot(x_boundary, y_learned, 'k-', linewidth=2, label='Learned boundary')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Logistic Regression Decision Boundary')
plt.legend()
plt.grid(True, alpha=0.3)
# Loss convergence
plt.subplot(1, 3, 2)
for name, result in results.items():
plt.plot(result['loss_history'], label=f'{name}', linewidth=2)
plt.xlabel('Iteration/Epoch')
plt.ylabel('Logistic Loss')
plt.title('Loss Convergence')
plt.legend()
plt.grid(True, alpha=0.3)
# Parameter convergence
plt.subplot(1, 3, 3)
print("Parameter Comparison:")
print(f"True parameters: {true_params}")
for name, result in results.items():
params = result['params']
error = np.linalg.norm(params - true_params)
print(f"{name} parameters: {params}")
print(f"{name} error: {error:.6f}")
plt.bar(range(len(params)), params, alpha=0.7, label=f'{name}')
plt.bar(range(len(true_params)), true_params, alpha=0.7,
color='black', label='True', width=0.3)
plt.xlabel('Parameter Index')
plt.ylabel('Parameter Value')
plt.title('Parameter Comparison')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
logistic_regression_optimization()
Summary
Gradient-based optimization methods are essential for:
- Machine Learning: Training neural networks and other models
- Parameter Estimation: Finding optimal model parameters
- Function Minimization: Solving continuous optimization problems
- Large-Scale Optimization: Handling high-dimensional problems
Key insights:
- The learning rate is crucial for convergence
- Momentum helps accelerate convergence and reduce oscillations
- Adaptive methods (Adam, AdaGrad) automatically adjust learning rates
- Second-order methods converge in fewer iterations but require more computation per step
- Stochastic methods are essential for large datasets
The choice of optimization method depends on:
- Problem size and computational budget
- Gradient availability and computational cost
- Convergence requirements
- Noise in the objective function
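In practice you will rarely hand-roll these loops: library routines bundle line searches, stopping rules, and numerical safeguards. As a minimal sketch (not part of the implementations above), scipy.optimize.minimize can be pointed at the same quadratic objective; the method names are standard SciPy options and all other settings are left at their defaults:

import numpy as np
from scipy.optimize import minimize

def f(x):
    return x[0]**2 + 2 * x[1]**2

def grad_f(x):
    return np.array([2 * x[0], 4 * x[1]])

x0 = np.array([3.0, 2.0])
for method in ["CG", "BFGS", "Newton-CG"]:
    result = minimize(f, x0, jac=grad_f, method=method)
    print(f"{method:>9}: x* = {result.x}, f(x*) = {result.fun:.2e}, iterations = {result.nit}")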
Understanding these methods enables you to select appropriate optimizers for different machine learning and optimization tasks.