Gradient-Based Optimization Methods
Gradient-based methods are the backbone of modern machine learning and optimization. They use derivative information to iteratively find optimal solutions, making them essential for training neural networks and solving large-scale optimization problems.
Gradient Descent Fundamentals
Basic Gradient Descent
The gradient descent algorithm updates parameters in the direction of steepest descent:
x_{k+1} = x_k - α ∇f(x_k)
Where:
- α is the learning rate (step size)
- ∇f(x_k) is the gradient of f at the point x_k
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
def gradient_descent_basic(f, grad_f, x0, learning_rate=0.01, max_iterations=1000, tolerance=1e-6):
    """Minimize a function with fixed-step gradient descent.

    Repeats ``x_new = x - learning_rate * grad_f(x)`` until the step length
    drops below ``tolerance`` or ``max_iterations`` steps have been taken.

    Args:
        f: Objective function. Accepted so all optimizers share one
            signature; it is not evaluated here.
        grad_f: Callable returning the gradient of the objective at ``x``.
        x0: Starting point (array-like). It is copied, never modified.
        learning_rate: Step size.
        max_iterations: Maximum number of update steps.
        tolerance: Stop once ``||x_new - x||`` is below this value.

    Returns:
        Tuple ``(x, history)``: the last accepted iterate, and an array whose
        rows are the accepted iterates beginning with the starting point.
    """
    # A float copy accepts lists and integer arrays and protects the caller's data.
    x = np.array(x0, dtype=float)
    history = [x.copy()]

    for i in range(max_iterations):
        gradient = grad_f(x)
        x_new = x - learning_rate * gradient

        # Check convergence: a negligible step is not applied or recorded.
        if np.linalg.norm(x_new - x) < tolerance:
            print(f"Converged after {i+1} iterations")
            break

        x = x_new
        history.append(x.copy())

    return x, np.array(history)
# Example: Minimize f(x,y) = x² + 2y²
def quadratic_function(x):
    """Return f(x, y) = x² + 2y² for a point ``x = (x[0], x[1])``."""
    return x[0]**2 + 2*x[1]**2
def quadratic_gradient(x):
    """Return the gradient (2x, 4y) of f(x, y) = x² + 2y² as a NumPy array."""
    return np.array([2*x[0], 4*x[1]])
# Optimize
x0 = np.array([3.0, 2.0])
optimal_x, history = gradient_descent_basic(
    quadratic_function, quadratic_gradient, x0, learning_rate=0.1
)

print(f"Starting point: {x0}")
print(f"Optimal point: {optimal_x}")
print(f"Function value: {quadratic_function(optimal_x):.6f}")

# Visualize optimization path
x = np.linspace(-4, 4, 100)
y = np.linspace(-3, 3, 100)
X, Y = np.meshgrid(x, y)
Z = X**2 + 2*Y**2

plt.figure(figsize=(12, 5))

# 2D contour plot
plt.subplot(1, 2, 1)
plt.contour(X, Y, Z, levels=20)
plt.plot(history[:, 0], history[:, 1], 'ro-', markersize=4, linewidth=2, label='Optimization path')
plt.plot(optimal_x[0], optimal_x[1], 'g*', markersize=15, label='Optimum')
plt.xlabel('x')
plt.ylabel('y')
plt.title('Gradient Descent Path')
plt.legend()
plt.grid(True, alpha=0.3)

# 3D surface plot: the path is lifted onto the surface by evaluating f at each iterate
ax = plt.subplot(1, 2, 2, projection='3d')
ax.plot_surface(X, Y, Z, alpha=0.6, cmap='viridis')
ax.plot(history[:, 0], history[:, 1], [quadratic_function(h) for h in history],
        'ro-', markersize=4, linewidth=2, label='Optimization path')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('f(x,y)')
ax.set_title('3D Optimization Path')

plt.tight_layout()
plt.show()
Learning Rate Analysis
def analyze_learning_rates():
    """Plot gradient-descent paths on f(x, y) = x² + 2y² for several learning rates.

    Runs ``gradient_descent_basic`` from (3, 2) for at most 50 iterations with
    each learning rate, draws the path over the contour lines in a 2x2 grid,
    and prints the final distance to the optimum at the origin.
    """
    learning_rates = [0.01, 0.1, 0.5, 0.9]
    x0 = np.array([3.0, 2.0])

    # The contour grid is the same for every panel, so build it once.
    x = np.linspace(-4, 4, 100)
    y = np.linspace(-3, 3, 100)
    X, Y = np.meshgrid(x, y)
    Z = X**2 + 2*Y**2

    plt.figure(figsize=(15, 10))

    for i, lr in enumerate(learning_rates):
        optimal_x, history = gradient_descent_basic(
            quadratic_function, quadratic_gradient, x0,
            learning_rate=lr, max_iterations=50
        )

        plt.subplot(2, 2, i+1)

        # Plot contours
        plt.contour(X, Y, Z, levels=15, alpha=0.6)

        # Plot optimization path
        plt.plot(history[:, 0], history[:, 1], 'ro-', markersize=3, linewidth=1.5)
        plt.plot(0, 0, 'g*', markersize=15, label='True optimum')
        plt.xlabel('x')
        plt.ylabel('y')
        plt.title(f'Learning Rate = {lr}')
        plt.grid(True, alpha=0.3)
        plt.legend()

        # Print convergence info (the optimum is the origin, so the norm is the error)
        final_error = np.linalg.norm(optimal_x)
        print(f"LR = {lr}: Final error = {final_error:.6f}, Iterations = {len(history)}")

    plt.tight_layout()
    plt.show()


analyze_learning_rates()
Advanced Gradient Methods
Momentum
Momentum helps accelerate convergence and reduces oscillations:
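v_{k+1} = β v_k + α ∇f(x_k)
x_{k+1} = x_k - v_{k+1}
Where β is the momentum coefficient, α is the learning rate, and v_k is the accumulated velocity (v_0 = 0).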
def gradient_descent_momentum(f, grad_f, x0, learning_rate=0.01, momentum=0.9,
                              max_iterations=1000, tolerance=1e-6):
    """Minimize a function with gradient descent plus momentum.

    Each step updates ``velocity = momentum * velocity + learning_rate * grad``
    and moves to ``x - velocity``; the velocity starts at zero.

    Args:
        f: Objective function. Accepted so all optimizers share one
            signature; it is not evaluated here.
        grad_f: Callable returning the gradient of the objective at ``x``.
        x0: Starting point (array-like). It is copied, never modified.
        learning_rate: Weight of the current gradient in the velocity.
        momentum: Fraction of the previous velocity that is retained.
        max_iterations: Maximum number of update steps.
        tolerance: Stop once ``||x_new - x||`` is below this value.

    Returns:
        Tuple ``(x, history)``: the last accepted iterate, and an array whose
        rows are the accepted iterates beginning with the starting point.
    """
    # A float copy accepts lists and integer arrays and protects the caller's data.
    x = np.array(x0, dtype=float)
    velocity = np.zeros_like(x)
    history = [x.copy()]

    for i in range(max_iterations):
        gradient = grad_f(x)
        velocity = momentum * velocity + learning_rate * gradient
        x_new = x - velocity

        # A negligible step ends the run; that step is not applied or recorded.
        if np.linalg.norm(x_new - x) < tolerance:
            print(f"Momentum GD converged after {i+1} iterations")
            break

        x = x_new
        history.append(x.copy())

    return x, np.array(history)
# Compare standard GD vs momentum
def compare_momentum():
    """Compare gradient descent with and without momentum.

    Both optimizers run for at most 100 iterations from (2, 2) on the
    ill-conditioned quadratic f(x, y) = 10x² + y². The figure shows each path
    over the contour lines and the distance to the optimum per iteration.
    """
    # Ill-conditioned quadratic: f(x,y) = 10x² + y²
    def ill_conditioned_f(x):
        return 10*x[0]**2 + x[1]**2

    def ill_conditioned_grad(x):
        return np.array([20*x[0], 2*x[1]])

    x0 = np.array([2.0, 2.0])

    # Standard gradient descent
    opt_std, hist_std = gradient_descent_basic(
        ill_conditioned_f, ill_conditioned_grad, x0,
        learning_rate=0.05, max_iterations=100
    )

    # Gradient descent with momentum
    opt_mom, hist_mom = gradient_descent_momentum(
        ill_conditioned_f, ill_conditioned_grad, x0,
        learning_rate=0.05, momentum=0.9, max_iterations=100
    )

    # Visualize comparison
    plt.figure(figsize=(15, 5))

    # Create contour plot
    x = np.linspace(-3, 3, 100)
    y = np.linspace(-3, 3, 100)
    X, Y = np.meshgrid(x, y)
    Z = 10*X**2 + Y**2

    # The two path panels differ only in data, line format, label and title.
    panels = [
        (hist_std, 'ro-', 'Standard GD', 'Standard Gradient Descent'),
        (hist_mom, 'bo-', 'Momentum GD', 'Gradient Descent with Momentum'),
    ]
    for position, (path, fmt, label, title) in enumerate(panels, start=1):
        plt.subplot(1, 3, position)
        plt.contour(X, Y, Z, levels=20)
        plt.plot(path[:, 0], path[:, 1], fmt, markersize=3, label=label)
        plt.plot(0, 0, 'g*', markersize=15, label='Optimum')
        plt.title(title)
        plt.xlabel('x')
        plt.ylabel('y')
        plt.legend()
        plt.grid(True, alpha=0.3)

    # Convergence comparison (the optimum is the origin, so the norm is the error)
    plt.subplot(1, 3, 3)
    std_errors = np.linalg.norm(hist_std, axis=1)
    mom_errors = np.linalg.norm(hist_mom, axis=1)

    plt.semilogy(std_errors, 'r-', label='Standard GD', linewidth=2)
    plt.semilogy(mom_errors, 'b-', label='Momentum GD', linewidth=2)
    plt.xlabel('Iteration')
    plt.ylabel('Distance to Optimum (log scale)')
    plt.title('Convergence Comparison')
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    print(f"Standard GD: {len(hist_std)} iterations, final error: {np.linalg.norm(opt_std):.6f}")
    print(f"Momentum GD: {len(hist_mom)} iterations, final error: {np.linalg.norm(opt_mom):.6f}")


compare_momentum()
Adaptive Learning Rates
AdaGrad
AdaGrad adapts the learning rate based on historical gradients:
def adagrad(f, grad_f, x0, learning_rate=0.1, epsilon=1e-8, max_iterations=1000, tolerance=1e-6):
    """Minimize a function with the AdaGrad optimizer.

    Each coordinate gets its own step size
    ``learning_rate / (sqrt(G) + epsilon)``, where ``G`` is the running sum of
    that coordinate's squared gradients.

    Args:
        f: Objective function. Accepted so all optimizers share one
            signature; it is not evaluated here.
        grad_f: Callable returning the gradient of the objective at ``x``.
        x0: Starting point (array-like). It is copied, never modified.
        learning_rate: Base step size before per-coordinate scaling.
        epsilon: Small constant that keeps the division well defined.
        max_iterations: Maximum number of update steps.
        tolerance: Stop once ``||x_new - x||`` is below this value.

    Returns:
        Tuple ``(x, history)``: the last accepted iterate, and an array whose
        rows are the accepted iterates beginning with the starting point.
    """
    # Float copy: with an integer start the accumulator below would be an
    # integer array and the in-place ``+=`` of float gradients would fail.
    x = np.array(x0, dtype=float)
    G = np.zeros_like(x)  # Accumulated squared gradients
    history = [x.copy()]

    for i in range(max_iterations):
        gradient = grad_f(x)
        G += gradient**2

        # Adaptive learning rate
        adapted_lr = learning_rate / (np.sqrt(G) + epsilon)
        x_new = x - adapted_lr * gradient

        # A negligible step ends the run; that step is not applied or recorded.
        if np.linalg.norm(x_new - x) < tolerance:
            print(f"AdaGrad converged after {i+1} iterations")
            break

        x = x_new
        history.append(x.copy())

    return x, np.array(history)
Adam (Adaptive Moment Estimation)
Adam combines momentum with adaptive learning rates:
def adam(f, grad_f, x0, learning_rate=0.001, beta1=0.9, beta2=0.999,
         epsilon=1e-8, max_iterations=1000, tolerance=1e-6):
    """Minimize a function with the Adam optimizer.

    Keeps exponential moving averages of the gradient (``m``) and the squared
    gradient (``v``), corrects both for their zero initialisation, and steps
    by ``learning_rate * m_hat / (sqrt(v_hat) + epsilon)``.

    Args:
        f: Objective function. Accepted so all optimizers share one
            signature; it is not evaluated here.
        grad_f: Callable returning the gradient of the objective at ``x``.
        x0: Starting point (array-like). It is copied, never modified.
        learning_rate: Step size.
        beta1: Decay rate of the first-moment average.
        beta2: Decay rate of the second-moment average.
        epsilon: Small constant that keeps the division well defined.
        max_iterations: Maximum number of update steps.
        tolerance: Stop once ``||x_new - x||`` is below this value.

    Returns:
        Tuple ``(x, history)``: the last accepted iterate, and an array whose
        rows are the accepted iterates beginning with the starting point.
    """
    # A float copy accepts lists and integer arrays and protects the caller's data.
    x = np.array(x0, dtype=float)
    m = np.zeros_like(x)  # First moment estimate
    v = np.zeros_like(x)  # Second moment estimate
    history = [x.copy()]

    for i in range(max_iterations):
        gradient = grad_f(x)
        step_number = i + 1  # bias correction uses a 1-based step count

        # Update biased first moment estimate
        m = beta1 * m + (1 - beta1) * gradient

        # Update biased second raw moment estimate
        v = beta2 * v + (1 - beta2) * gradient**2

        # Compute bias-corrected moment estimates
        m_hat = m / (1 - beta1**step_number)
        v_hat = v / (1 - beta2**step_number)

        # Update parameters
        x_new = x - learning_rate * m_hat / (np.sqrt(v_hat) + epsilon)

        # A negligible step ends the run; that step is not applied or recorded.
        if np.linalg.norm(x_new - x) < tolerance:
            print(f"Adam converged after {i+1} iterations")
            break

        x = x_new
        history.append(x.copy())

    return x, np.array(history)
def compare_optimizers():
    """Compare GD, momentum, AdaGrad and Adam on the Rosenbrock function.

    Each optimizer runs for at most 2000 iterations from (-1, 1). The paths
    are drawn over log-spaced contour lines in a 2x2 grid and a summary table
    (history length, final value, distance to the minimum at (1, 1)) is printed.
    """
    # Rosenbrock function: f(x,y) = (a-x)² + b(y-x²)²
    def rosenbrock(x, a=1, b=100):
        return (a - x[0])**2 + b * (x[1] - x[0]**2)**2

    def rosenbrock_grad(x, a=1, b=100):
        dx = -2*(a - x[0]) - 4*b*x[0]*(x[1] - x[0]**2)
        dy = 2*b*(x[1] - x[0]**2)
        return np.array([dx, dy])

    x0 = np.array([-1.0, 1.0])

    # Run different optimizers (hyper-parameters passed by keyword for clarity)
    optimizers = {
        'GD': lambda: gradient_descent_basic(
            rosenbrock, rosenbrock_grad, x0, learning_rate=0.001, max_iterations=2000),
        'Momentum': lambda: gradient_descent_momentum(
            rosenbrock, rosenbrock_grad, x0, learning_rate=0.001, momentum=0.9,
            max_iterations=2000),
        'AdaGrad': lambda: adagrad(
            rosenbrock, rosenbrock_grad, x0, learning_rate=0.1, max_iterations=2000),
        'Adam': lambda: adam(
            rosenbrock, rosenbrock_grad, x0, learning_rate=0.01, max_iterations=2000),
    }

    results = {}
    for name, optimizer in optimizers.items():
        opt_x, history = optimizer()
        results[name] = {
            'optimum': opt_x,
            'history': history,
            'final_value': rosenbrock(opt_x)
        }

    # Visualize results
    plt.figure(figsize=(15, 10))

    # Create Rosenbrock function contour
    x = np.linspace(-2, 2, 100)
    y = np.linspace(-1, 3, 100)
    X, Y = np.meshgrid(x, y)
    Z = (1 - X)**2 + 100 * (Y - X**2)**2

    colors = ['red', 'blue', 'green', 'orange']

    for i, (name, result) in enumerate(results.items()):
        plt.subplot(2, 2, i+1)
        # Log-spaced levels: the function spans several orders of magnitude
        plt.contour(X, Y, Z, levels=np.logspace(-1, 3, 20))

        history = result['history']
        plt.plot(history[:, 0], history[:, 1], 'o-', color=colors[i],
                 markersize=2, linewidth=1, alpha=0.7, label=name)
        plt.plot(1, 1, 'r*', markersize=15, label='Global minimum')

        plt.xlabel('x')
        plt.ylabel('y')
        plt.title(f'{name} Optimizer')
        plt.legend()
        plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Print results summary
    print("Optimizer Comparison Results:")
    print("=" * 50)
    print(f"{'Optimizer':>10} {'Iterations':>12} {'Final Value':>15} {'Distance to Opt':>15}")
    print("-" * 50)

    for name, result in results.items():
        iterations = len(result['history'])
        final_value = result['final_value']
        distance = np.linalg.norm(result['optimum'] - np.array([1, 1]))
        print(f"{name:>10} {iterations:>12} {final_value:>15.6f} {distance:>15.6f}")


compare_optimizers()
Stochastic Gradient Descent
Mini-batch SGD
For large datasets, we use mini-batch stochastic gradient descent:
def stochastic_gradient_descent(X, y, loss_fn, grad_fn, theta0,
                                learning_rate=0.01, batch_size=32,
                                epochs=100, shuffle=True):
    """Fit parameters with mini-batch stochastic gradient descent.

    Every epoch optionally shuffles the samples (using NumPy's global random
    state, so ``np.random.seed`` makes runs repeatable), walks through them in
    consecutive mini-batches, and applies one gradient step per batch.

    Args:
        X: Sample matrix with one row per sample.
        y: Targets aligned with the rows of ``X``.
        loss_fn: Callable ``loss_fn(theta, batch_X, batch_y)`` returning a scalar.
        grad_fn: Callable ``grad_fn(theta, batch_X, batch_y)`` returning the
            gradient with respect to ``theta``.
        theta0: Initial parameters (array-like). Copied, never modified.
        learning_rate: Step size.
        batch_size: Samples per mini-batch; the last batch may be smaller.
        epochs: Number of passes over the data.
        shuffle: Whether to reshuffle the samples at the start of each epoch.

    Returns:
        Tuple ``(theta, history)`` where ``history['loss']`` holds the mean
        batch loss of each epoch (each measured after that batch's update) and
        ``history['theta']`` the parameters at the end of each epoch.

    Raises:
        ValueError: If ``batch_size`` is not positive or ``X`` has no samples.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = X.shape[0]
    # Fail early with a clear message instead of dividing by zero batches later.
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if n_samples == 0:
        raise ValueError("X must contain at least one sample")

    theta = np.array(theta0, dtype=float)
    history = {'loss': [], 'theta': []}

    for epoch in range(epochs):
        # Shuffle data
        if shuffle:
            indices = np.random.permutation(n_samples)
            X_shuffled = X[indices]
            y_shuffled = y[indices]
        else:
            X_shuffled, y_shuffled = X, y

        epoch_loss = 0
        n_batches = 0

        # Process mini-batches
        for i in range(0, n_samples, batch_size):
            batch_X = X_shuffled[i:i+batch_size]
            batch_y = y_shuffled[i:i+batch_size]

            # Compute gradient on mini-batch
            gradient = grad_fn(theta, batch_X, batch_y)

            # Update parameters
            theta = theta - learning_rate * gradient

            # Track loss (measured with the freshly updated parameters)
            epoch_loss += loss_fn(theta, batch_X, batch_y)
            n_batches += 1

        # Record history
        avg_loss = epoch_loss / n_batches
        history['loss'].append(avg_loss)
        history['theta'].append(theta.copy())

        if epoch % 10 == 0:
            print(f"Epoch {epoch}: Loss = {avg_loss:.6f}")

    return theta, history
# Example: Linear regression with SGD
def linear_regression_sgd_example():
    """Fit a linear regression with mini-batch SGD and compare to least squares.

    Generates 1000 noisy samples with 5 features (seed 42), prepends a bias
    column, fits with ``stochastic_gradient_descent``, solves the normal
    equations for reference, prints both results, and plots the loss curve
    and the evolution of each parameter.
    """
    # Generate synthetic data
    np.random.seed(42)
    n_samples, n_features = 1000, 5
    X = np.random.randn(n_samples, n_features)
    true_theta = np.random.randn(n_features)
    y = X @ true_theta + 0.1 * np.random.randn(n_samples)

    # Add bias term (the data was generated without an intercept, so its true value is 0)
    X = np.column_stack([np.ones(n_samples), X])
    true_theta = np.concatenate([[0], true_theta])

    # Define loss and gradient functions
    def mse_loss(theta, X, y):
        predictions = X @ theta
        return np.mean((predictions - y)**2)

    def mse_gradient(theta, X, y):
        predictions = X @ theta
        return 2 * X.T @ (predictions - y) / len(y)

    # Initialize parameters
    theta0 = np.random.randn(X.shape[1]) * 0.1

    # Run SGD
    theta_sgd, history = stochastic_gradient_descent(
        X, y, mse_loss, mse_gradient, theta0,
        learning_rate=0.01, batch_size=32, epochs=100
    )

    # Compare with analytical solution (normal equations)
    theta_analytical = np.linalg.solve(X.T @ X, X.T @ y)

    print(f"\nResults Comparison:")
    print(f"True parameters: {true_theta}")
    print(f"SGD parameters: {theta_sgd}")
    print(f"Analytical solution: {theta_analytical}")
    print(f"SGD error: {np.linalg.norm(theta_sgd - true_theta):.6f}")
    print(f"Analytical error: {np.linalg.norm(theta_analytical - true_theta):.6f}")

    # Plot convergence
    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    plt.plot(history['loss'])
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('SGD Convergence')
    plt.grid(True, alpha=0.3)

    plt.subplot(1, 2, 2)
    theta_history = np.array(history['theta'])
    for i, target in enumerate(true_theta):
        plt.plot(theta_history[:, i], label=f'θ_{i}')
        # Dashed line in the matching colour marks the true value
        plt.axhline(target, color=f'C{i}', linestyle='--', alpha=0.7)

    plt.xlabel('Epoch')
    plt.ylabel('Parameter Value')
    plt.title('Parameter Evolution')
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()


linear_regression_sgd_example()
Second-Order Methods
Newton’s Method
Newton’s method uses second-order information (Hessian matrix):
x_{k+1} = x_k - H^{-1}(x_k) ∇f(x_k)
def newtons_method(f, grad_f, hessian_f, x0, max_iterations=100, tolerance=1e-6):
    """Minimize a function with Newton's method.

    Each step solves ``H(x) * step = grad_f(x)`` and moves to ``x - step``.
    A warning is printed when the Hessian is not positive definite (the step
    is still taken), and the run stops if the Hessian is singular.

    Args:
        f: Objective function. Accepted so all optimizers share one
            signature; it is not evaluated here.
        grad_f: Callable returning the gradient of the objective at ``x``.
        hessian_f: Callable returning the Hessian matrix at ``x``; it is
            treated as symmetric.
        x0: Starting point (array-like). It is copied, never modified.
        max_iterations: Maximum number of Newton steps.
        tolerance: Stop once ``||x_new - x||`` is below this value.

    Returns:
        Tuple ``(x, history)``: the last accepted iterate, and an array whose
        rows are the accepted iterates beginning with the starting point.
    """
    # A float copy accepts lists and integer arrays and protects the caller's data.
    x = np.array(x0, dtype=float)
    history = [x.copy()]

    for i in range(max_iterations):
        gradient = grad_f(x)
        hessian = hessian_f(x)

        # Check if Hessian is positive definite. eigvalsh is the symmetric
        # solver, so the eigenvalues are always real and comparable with 0.
        eigenvals = np.linalg.eigvalsh(hessian)
        if np.any(eigenvals <= 0):
            print(f"Warning: Hessian not positive definite at iteration {i}")

        # Newton step
        try:
            newton_step = np.linalg.solve(hessian, gradient)
        except np.linalg.LinAlgError:
            print(f"Singular Hessian at iteration {i}")
            break
        x_new = x - newton_step

        # A negligible step ends the run; that step is not applied or recorded.
        if np.linalg.norm(x_new - x) < tolerance:
            print(f"Newton's method converged after {i+1} iterations")
            break

        x = x_new
        history.append(x.copy())

    return x, np.array(history)
# Example: Compare Newton's method with gradient descent
def compare_newton_gd():
    """Compare Newton's method with gradient descent on f(x, y) = x² + 2y².

    Both start from (3, 2). The figure shows each path over the contour lines
    and the distance to the optimum per iteration; iteration counts and final
    errors are printed.
    """
    # Quadratic function with Hessian
    def quad_hessian(x):
        return np.array([[2, 0], [0, 4]])

    x0 = np.array([3.0, 2.0])

    # Newton's method
    opt_newton, hist_newton = newtons_method(
        quadratic_function, quadratic_gradient, quad_hessian, x0
    )

    # Gradient descent
    opt_gd, hist_gd = gradient_descent_basic(
        quadratic_function, quadratic_gradient, x0, learning_rate=0.1
    )

    # Visualize comparison
    plt.figure(figsize=(15, 5))

    # Create contour plot
    x = np.linspace(-4, 4, 100)
    y = np.linspace(-3, 3, 100)
    X, Y = np.meshgrid(x, y)
    Z = X**2 + 2*Y**2

    # Newton's method
    plt.subplot(1, 3, 1)
    plt.contour(X, Y, Z, levels=20)
    plt.plot(hist_newton[:, 0], hist_newton[:, 1], 'ro-', markersize=6, linewidth=2, label="Newton's Method")
    plt.plot(0, 0, 'g*', markersize=15, label='Optimum')
    plt.title("Newton's Method")
    plt.xlabel('x')
    plt.ylabel('y')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Gradient descent
    plt.subplot(1, 3, 2)
    plt.contour(X, Y, Z, levels=20)
    plt.plot(hist_gd[:, 0], hist_gd[:, 1], 'bo-', markersize=3, linewidth=1, label='Gradient Descent')
    plt.plot(0, 0, 'g*', markersize=15, label='Optimum')
    plt.title('Gradient Descent')
    plt.xlabel('x')
    plt.ylabel('y')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Convergence comparison. Newton reaches the optimum of a quadratic
    # exactly, and a zero error cannot be drawn on a log axis, so errors are
    # floored at machine epsilon to keep every point visible.
    plt.subplot(1, 3, 3)
    floor = np.finfo(float).eps
    newton_errors = np.maximum(np.linalg.norm(hist_newton, axis=1), floor)
    gd_errors = np.maximum(np.linalg.norm(hist_gd, axis=1), floor)

    plt.semilogy(newton_errors, 'ro-', label="Newton's Method", linewidth=2)
    plt.semilogy(gd_errors, 'bo-', label='Gradient Descent', linewidth=2)
    plt.xlabel('Iteration')
    plt.ylabel('Distance to Optimum (log scale)')
    plt.title('Convergence Comparison')
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    print(f"Newton's method: {len(hist_newton)} iterations")
    print(f"Gradient descent: {len(hist_gd)} iterations")
    print(f"Newton's method final error: {np.linalg.norm(opt_newton):.10f}")
    print(f"Gradient descent final error: {np.linalg.norm(opt_gd):.10f}")


compare_newton_gd()
Quasi-Newton Methods (BFGS)
BFGS approximates the Hessian using gradient information:
def bfgs(f, grad_f, x0, max_iterations=100, tolerance=1e-6):
    """Minimize a function with the BFGS quasi-Newton method.

    Maintains an approximation ``H`` of the *inverse* Hessian, built only from
    gradient differences, and searches along ``p = -H @ gradient``. The step
    length is found by halving from 1.0 until the Armijo sufficient-decrease
    condition holds.

    Args:
        f: Objective function, evaluated by the line search. If it is not
            callable, a fixed unit step is used instead.
        grad_f: Callable returning the gradient of the objective at ``x``.
        x0: Starting point (array-like). It is copied, never modified.
        max_iterations: Maximum number of quasi-Newton steps.
        tolerance: Stop once the gradient norm at the new point is below
            this value.

    Returns:
        Tuple ``(x, history)``: the final iterate (the converged point when
        the tolerance was met), and an array whose rows are the iterates
        beginning with the starting point.
    """
    armijo_c1 = 1e-4     # sufficient-decrease constant
    max_backtracks = 30  # step is halved at most this many times

    x = np.array(x0, dtype=float)
    identity = np.eye(len(x))
    H = identity.copy()  # Initial inverse-Hessian approximation
    history = [x.copy()]
    gradient = np.asarray(grad_f(x), dtype=float)
    use_line_search = callable(f)

    for i in range(max_iterations):
        # Search direction. H approximates the inverse Hessian, so it is
        # applied directly rather than solving a linear system.
        p = -H @ gradient

        # Backtracking line search starting from the full quasi-Newton step
        alpha = 1.0
        x_new = x + alpha * p
        if use_line_search:
            fx = f(x)
            slope = gradient @ p
            for _ in range(max_backtracks):
                if f(x_new) <= fx + armijo_c1 * alpha * slope:
                    break
                alpha *= 0.5
                x_new = x + alpha * p
        gradient_new = np.asarray(grad_f(x_new), dtype=float)

        # Check convergence. The test is on the gradient at x_new, so x_new
        # is the point that has to be returned.
        if np.linalg.norm(gradient_new) < tolerance:
            x = x_new
            history.append(x.copy())
            print(f"BFGS converged after {i+1} iterations")
            break

        # BFGS update
        s = x_new - x
        y = gradient_new - gradient
        sy = np.dot(s, y)

        # Skip the update unless the curvature s·y is safely positive; this
        # avoids division by zero and keeps H positive definite.
        if sy > 1e-10 * np.linalg.norm(s) * np.linalg.norm(y):
            rho = 1.0 / sy
            H = (identity - rho * np.outer(s, y)) @ H @ (identity - rho * np.outer(y, s)) \
                + rho * np.outer(s, s)

        x = x_new
        gradient = gradient_new
        history.append(x.copy())

    return x, np.array(history)
# Compare BFGS with other methods
def compare_all_methods():
    """Plot the paths of gradient descent, Newton, BFGS and Adam on one figure.

    All methods minimize f(x, y) = x² + 2y² from (3, 2); each legend entry
    shows the method's history length.
    """
    x0 = np.array([3.0, 2.0])

    methods = {
        'Gradient Descent': lambda: gradient_descent_basic(quadratic_function, quadratic_gradient, x0, 0.1),
        'Newton': lambda: newtons_method(quadratic_function, quadratic_gradient,
                                         lambda x: np.array([[2, 0], [0, 4]]), x0),
        'BFGS': lambda: bfgs(quadratic_function, quadratic_gradient, x0),
        'Adam': lambda: adam(quadratic_function, quadratic_gradient, x0, 0.1)
    }

    plt.figure(figsize=(12, 8))
    colors = ['red', 'blue', 'green', 'orange']

    # Create contour plot
    x = np.linspace(-4, 4, 100)
    y = np.linspace(-3, 3, 100)
    X, Y = np.meshgrid(x, y)
    Z = X**2 + 2*Y**2
    plt.contour(X, Y, Z, levels=15, alpha=0.6)

    for i, (name, method) in enumerate(methods.items()):
        opt_x, history = method()
        plt.plot(history[:, 0], history[:, 1], 'o-', color=colors[i],
                 markersize=4, linewidth=2, label=f'{name} ({len(history)} iter)')

    plt.plot(0, 0, 'k*', markersize=15, label='Optimum')
    plt.xlabel('x')
    plt.ylabel('y')
    plt.title('Optimization Methods Comparison')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()


compare_all_methods()
Applications in Machine Learning
Logistic Regression
def logistic_regression_optimization():
    """Fit a logistic regression with mini-batch SGD and with Adam.

    Generates 1000 two-feature samples (seed 42) with labels drawn from a
    known logistic model, fits the bias and weights with both optimizers, and
    plots the decision boundaries, the loss curves and the fitted parameters
    next to the true ones.
    """
    # Generate binary classification data
    np.random.seed(42)
    n_samples, n_features = 1000, 2
    X = np.random.randn(n_samples, n_features)
    true_w = np.array([1.5, -2.0])
    true_b = 0.5

    # Generate labels
    logits = X @ true_w + true_b
    probabilities = 1 / (1 + np.exp(-logits))
    y = np.random.binomial(1, probabilities)

    # Add bias term
    X_with_bias = np.column_stack([np.ones(n_samples), X])
    true_params = np.array([true_b, true_w[0], true_w[1]])

    # Logistic regression loss and gradient
    def logistic_loss(params, X, y):
        logits = X @ params
        # logaddexp(0, z) equals log(1 + exp(z)) but does not overflow for large z
        return np.mean(np.logaddexp(0, logits) - y * logits)

    def logistic_gradient(params, X, y):
        logits = X @ params
        probabilities = 1 / (1 + np.exp(-logits))
        return X.T @ (probabilities - y) / len(y)

    # Initialize parameters
    params0 = np.random.randn(X_with_bias.shape[1]) * 0.1

    # Optimize using different methods
    methods = {
        'SGD': lambda: stochastic_gradient_descent(
            X_with_bias, y, logistic_loss, logistic_gradient, params0,
            learning_rate=0.1, batch_size=64, epochs=100
        ),
        'Adam': lambda: adam(
            lambda p: logistic_loss(p, X_with_bias, y),
            lambda p: logistic_gradient(p, X_with_bias, y),
            params0, learning_rate=0.01, max_iterations=1000
        )
    }

    results = {}
    for name, method in methods.items():
        if name == 'SGD':
            # SGD already reports one loss value per epoch
            params_opt, history = method()
            loss_history = history['loss']
        else:
            # Adam returns the parameter path; evaluate the full-data loss along it
            params_opt, param_history = method()
            loss_history = [logistic_loss(p, X_with_bias, y) for p in param_history]

        results[name] = {
            'params': params_opt,
            'loss_history': loss_history,
            'final_loss': logistic_loss(params_opt, X_with_bias, y)
        }

    # Visualize results
    plt.figure(figsize=(15, 5))

    # Decision boundary visualization
    plt.subplot(1, 3, 1)

    # Plot data points
    plt.scatter(X[y==0, 0], X[y==0, 1], c='red', marker='o', alpha=0.6, label='Class 0')
    plt.scatter(X[y==1, 0], X[y==1, 1], c='blue', marker='s', alpha=0.6, label='Class 1')

    # Plot true decision boundary: b + w1*x1 + w2*x2 = 0 solved for x2
    x_boundary = np.linspace(-3, 3, 100)
    y_boundary = -(true_params[0] + true_params[1] * x_boundary) / true_params[2]
    plt.plot(x_boundary, y_boundary, 'g--', linewidth=2, label='True boundary')

    # Plot learned decision boundary (using Adam result)
    adam_params = results['Adam']['params']
    y_learned = -(adam_params[0] + adam_params[1] * x_boundary) / adam_params[2]
    plt.plot(x_boundary, y_learned, 'k-', linewidth=2, label='Learned boundary')

    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title('Logistic Regression Decision Boundary')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Loss convergence
    plt.subplot(1, 3, 2)
    for name, result in results.items():
        plt.plot(result['loss_history'], label=f'{name}', linewidth=2)

    plt.xlabel('Iteration/Epoch')
    plt.ylabel('Logistic Loss')
    plt.title('Loss Convergence')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Parameter convergence. Bars are placed side by side per parameter index
    # so one method's bars do not hide another's.
    plt.subplot(1, 3, 3)
    positions = np.arange(len(true_params))
    bar_width = 0.25

    print("Parameter Comparison:")
    print(f"True parameters: {true_params}")
    for k, (name, result) in enumerate(results.items()):
        params = result['params']
        error = np.linalg.norm(params - true_params)
        print(f"{name} parameters: {params}")
        print(f"{name} error: {error:.6f}")

        plt.bar(positions + (k - 1) * bar_width, params, width=bar_width,
                alpha=0.7, label=f'{name}')

    plt.bar(positions + (len(results) - 1) * bar_width, true_params, width=bar_width,
            alpha=0.7, color='black', label='True')
    plt.xticks(positions)
    plt.xlabel('Parameter Index')
    plt.ylabel('Parameter Value')
    plt.title('Parameter Comparison')
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()


logistic_regression_optimization()
Summary
Gradient-based optimization methods are essential for:
- Machine Learning: Training neural networks and other models
- Parameter Estimation: Finding optimal model parameters
- Function Minimization: Solving continuous optimization problems
- Large-Scale Optimization: Handling high-dimensional problems
Key insights:
- The learning rate is crucial for convergence
- Momentum helps accelerate convergence and reduce oscillations
- Adaptive methods (Adam, AdaGrad) automatically adjust learning rates
- Second-order methods converge in fewer iterations but require more computation per iteration
- Stochastic methods are essential for large datasets
The choice of optimization method depends on:
- Problem size and computational budget
- Gradient availability and computational cost
- Convergence requirements
- Noise in the objective function
Understanding these methods enables you to select appropriate optimizers for different machine learning and optimization tasks.