Advanced Optimization
Introduction
Advanced optimization techniques go beyond basic algorithmic improvements to leverage hardware-specific features, compiler optimizations, and parallel processing capabilities. This chapter explores sophisticated methods for maximizing performance in C programs, including vectorization, parallel processing, compiler-assisted optimizations, and specialized techniques for specific architectures.
Compiler Optimization
Modern compilers provide powerful optimization capabilities that can significantly improve performance with minimal programmer effort.
Optimization Levels
Compilers provide several optimization levels (-O0 through -O3, plus -Os), each trading longer compile times for faster or smaller generated code:
// Example showing the impact of different optimization levels
// Compile with: gcc -O0, -O1, -O2, -O3, -Os
#include <stdio.h>
#include <time.h>
#define ITERATIONS 100000000
// Simple loop that can be optimized
/*
 * Returns the sum of (2*i + 1) for i in [0, n), which is mathematically n*n.
 *
 * Fix: the original accumulated into a signed int, which overflows for the
 * large n used by the demo driver (ITERATIONS = 1e8 gives a sum near 1e16) —
 * signed overflow is undefined behavior in C.  Accumulating in unsigned
 * arithmetic is fully defined (wraps modulo 2^32), and the final conversion
 * back to int is implementation-defined rather than UB, preserving the
 * observable result on two's-complement targets.
 */
int compute_sum(int n) {
    unsigned int sum = 0;
    for (int i = 0; i < n; i++) {
        /* i * 2 + 1 enumerates the odd numbers 1, 3, 5, ... */
        sum += (unsigned int)(i * 2 + 1);
    }
    return (int)sum;
}
/* Times a single call to compute_sum() so the effect of the -O0..-O3/-Os
 * flags can be compared.  The result is stored into a volatile so the
 * compiler cannot delete the computation outright. */
void optimization_levels_demo(void) {
    clock_t t0 = clock();
    volatile int result = compute_sum(ITERATIONS);
    clock_t t1 = clock();
    (void)result;
    double elapsed = (double)(t1 - t0) / CLOCKS_PER_SEC;
    printf("Computation time: %.4f seconds\n", elapsed);
}
Compiler Hints and Pragmas
Compiler hints can guide optimization decisions:
#include <stdio.h>
#include <stdlib.h>
// Function likely to be called frequently
/* Frequently-executed helper; the hot attribute asks GCC/Clang to favor it
 * during optimization and code layout.  Computes (x + 1)^2 written out as
 * x^2 + 2x + 1. */
__attribute__((hot))
int hot_function(int x) {
    int squared = x * x;
    return squared + 2 * x + 1;
}
// Function unlikely to be called
/* Rarely-taken failure path; the cold attribute moves it out of the hot
 * code layout and marks branches to it as unlikely. */
__attribute__((cold))
void error_handler(void) {
    printf("Error occurred!\n");
    exit(1); /* terminates the process; never returns */
}
// Loop unrolling hint
/* Demonstrates a per-loop unrolling request (GCC 8+): the pragma asks the
 * compiler to unroll the following loop by a factor of 4. */
void loop_unrolling_example(void) {
    int squares[1000];
    #pragma GCC unroll 4
    for (int idx = 0; idx < 1000; idx++) {
        squares[idx] = idx * idx;
    }
}
// Vectorization hints
/* Element-wise float addition: c[i] = a[i] + b[i].  The ivdep pragma
 * asserts there are no loop-carried pointer dependencies, clearing the
 * way for the vectorizer. */
void vectorization_example(float *a, float *b, float *c, int n) {
    #pragma GCC ivdep
    for (int idx = 0; idx < n; idx++) {
        c[idx] = a[idx] + b[idx];
    }
}
Profile-Guided Optimization (PGO)
PGO uses runtime profiling data to optimize code placement and branch prediction:
// Example of code that benefits from PGO
#include <stdio.h>
/* Three-way tiered transform; the relative branch frequencies here are
 * exactly what profile-guided optimization learns from training runs.
 * Restyled as guard clauses from rarest to most common case. */
int process_data(int data) {
    if (data >= 1000) {
        return data / 2;   /* rare case */
    }
    if (data >= 100) {
        return data + 100; /* less common case */
    }
    return data * 2;       /* common case */
}
/* Exercises process_data() over a representative input range.  In a real
 * PGO build this run would be profiled (-fprofile-generate) and the
 * program then rebuilt with -fprofile-use. */
void pgo_example(void) {
    for (int i = 0; i < 10000; i++) {
        /* volatile sink keeps the call from being optimized away */
        volatile int dummy = process_data(i);
        (void)dummy;
    }
}
Vectorization
Vectorization allows processing multiple data elements simultaneously using SIMD (Single Instruction, Multiple Data) instructions.
Manual Vectorization with Intrinsics
Using compiler intrinsics for explicit vectorization:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#ifdef __SSE__
#include <xmmintrin.h> // SSE
#endif
#ifdef __AVX__
#include <immintrin.h> // AVX
#endif
// Scalar addition
/* Baseline scalar loop: element-wise sum of a[] and b[] into c[]. */
void scalar_add(float *a, float *b, float *c, int n) {
    for (int idx = 0; idx < n; idx++) {
        c[idx] = a[idx] + b[idx];
    }
}
#ifdef __SSE__
// SSE vectorized addition
/*
 * SSE element-wise addition: c[i] = a[i] + b[i], 4 floats per step.
 *
 * Fix: the original used _mm_load_ps/_mm_store_ps, which require the
 * pointers to be 16-byte aligned; the arrays come from plain malloc(),
 * whose alignment is not guaranteed to be 16 bytes by the C standard.
 * The unaligned variants are safe for any pointer and cost nothing extra
 * on modern CPUs when the data happens to be aligned anyway.
 */
void sse_add(float *a, float *b, float *c, int n) {
    int i = 0;
    /* Vector lane: 4 elements per iteration. */
    for (; i <= n - 4; i += 4) {
        __m128 va = _mm_loadu_ps(&a[i]);
        __m128 vb = _mm_loadu_ps(&b[i]);
        _mm_storeu_ps(&c[i], _mm_add_ps(va, vb));
    }
    /* Scalar tail for the final n % 4 elements. */
    for (; i < n; i++) {
        c[i] = a[i] + b[i];
    }
}
#endif
#ifdef __AVX__
// AVX vectorized addition
/*
 * AVX element-wise addition: c[i] = a[i] + b[i], 8 floats per step.
 *
 * Fix: the original used _mm256_load_ps/_mm256_store_ps, which require
 * 32-byte alignment.  malloc() typically guarantees only 16 bytes
 * (alignof(max_align_t)), so the aligned loads could fault at runtime
 * when called from vectorization_comparison().  The unaligned variants
 * are correct for any pointer.
 */
void avx_add(float *a, float *b, float *c, int n) {
    int i = 0;
    /* Vector lane: 8 elements per iteration. */
    for (; i <= n - 8; i += 8) {
        __m256 va = _mm256_loadu_ps(&a[i]);
        __m256 vb = _mm256_loadu_ps(&b[i]);
        _mm256_storeu_ps(&c[i], _mm256_add_ps(va, vb));
    }
    /* Scalar tail for the final n % 8 elements. */
    for (; i < n; i++) {
        c[i] = a[i] + b[i];
    }
}
#endif
/* Benchmarks scalar vs. SIMD addition on the same one-million-element
 * arrays and reports the speedups.
 * Fix: the three malloc() results are now checked before use. */
void vectorization_comparison(void) {
    const int size = 1000000;
    float *a = malloc(size * sizeof *a);
    float *b = malloc(size * sizeof *b);
    float *c = malloc(size * sizeof *c);
    if (!a || !b || !c) {
        printf("Allocation failure\n");
        free(a);
        free(b);
        free(c);
        return;
    }
    /* Deterministic input data. */
    for (int i = 0; i < size; i++) {
        a[i] = (float)i;
        b[i] = (float)(i * 2);
    }
    clock_t start, end;
    start = clock();
    scalar_add(a, b, c, size);
    end = clock();
    double scalar_time = ((double)(end - start)) / CLOCKS_PER_SEC;
#ifdef __SSE__
    start = clock();
    sse_add(a, b, c, size);
    end = clock();
    double sse_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    printf("SSE time: %.4f seconds (%.2fx speedup)\n",
           sse_time, scalar_time / sse_time);
#endif
#ifdef __AVX__
    start = clock();
    avx_add(a, b, c, size);
    end = clock();
    double avx_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    printf("AVX time: %.4f seconds (%.2fx speedup)\n",
           avx_time, scalar_time / avx_time);
#endif
    printf("Scalar time: %.4f seconds\n", scalar_time);
    free(a);
    free(b);
    free(c);
}
Auto-Vectorization
Modern compilers can automatically vectorize suitable loops:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define ARRAY_SIZE 1000000
// Compiler can auto-vectorize this function
/* Multiply-add over contiguous arrays: c[i] = a[i] * b[i] + 1.
 * Independent iterations, unit-stride access, a simple arithmetic body
 * and a known trip count make this a textbook candidate for the
 * auto-vectorizer (-O3 or -ftree-vectorize). */
void auto_vectorizable_loop(float *a, float *b, float *c, int n) {
    for (int idx = 0; idx < n; idx++) {
        c[idx] = a[idx] * b[idx] + 1.0f;
    }
}
// This loop cannot be auto-vectorized due to dependency
/* Running sum with a loop-carried dependency: each a[i] reads the a[i-1]
 * written by the previous iteration, so the loop cannot be vectorized
 * as written. */
void non_vectorizable_loop(int *a, int n) {
    for (int idx = 1; idx < n; idx++) {
        a[idx] = a[idx - 1] + idx;
    }
}
/* Times the auto-vectorizable kernel over ARRAY_SIZE elements and prints
 * the first few results as a sanity check.
 * Fix: the three malloc() results are now checked before use. */
void auto_vectorization_example(void) {
    float *a = malloc(ARRAY_SIZE * sizeof *a);
    float *b = malloc(ARRAY_SIZE * sizeof *b);
    float *c = malloc(ARRAY_SIZE * sizeof *c);
    if (!a || !b || !c) {
        printf("Allocation failure\n");
        free(a);
        free(b);
        free(c);
        return;
    }
    /* Deterministic input data. */
    for (int i = 0; i < ARRAY_SIZE; i++) {
        a[i] = (float)i;
        b[i] = (float)(i * 2);
    }
    clock_t start = clock();
    auto_vectorizable_loop(a, b, c, ARRAY_SIZE);
    clock_t end = clock();
    double time_spent = ((double)(end - start)) / CLOCKS_PER_SEC;
    printf("Auto-vectorized loop time: %.4f seconds\n", time_spent);
    printf("First 5 results: ");
    for (int i = 0; i < 5; i++) {
        printf("%.0f ", c[i]);
    }
    printf("\n");
    free(a);
    free(b);
    free(c);
}
Parallel Processing
Utilizing multiple CPU cores can dramatically improve performance for suitable workloads.
Thread-Level Parallelism with Pthreads
Manual thread management for fine-grained control:
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <time.h>
#define NUM_THREADS 4
#define ARRAY_SIZE 1000000
// Per-thread work descriptor for the parallel sum: each worker sums the
// half-open index range [start, end) of the shared array into its own
// `sum` field, so no synchronization is needed during the computation.
typedef struct {
int *array;    // shared input array; read-only in the workers
int start;     // first index of this thread's slice (inclusive)
int end;       // one past the last index of the slice (exclusive)
long long sum; // partial result, written only by the owning thread
} thread_data_t;
// Thread function to compute partial sum
/* pthread entry point: sums the descriptor's slice [start, end) of the
 * shared array and stores the partial result back into the descriptor. */
void* compute_partial_sum(void *arg) {
    thread_data_t *slice = (thread_data_t *)arg;
    slice->sum = 0;
    for (int idx = slice->start; idx < slice->end; idx++) {
        slice->sum += slice->array[idx];
    }
    return NULL;
}
/* Sums array[0..n) by splitting it across NUM_THREADS pthreads; the last
 * slice absorbs the remainder when n is not divisible by NUM_THREADS.
 *
 * Fix: the original ignored pthread_create()'s return value; on failure,
 * pthread_join() would then be called on an uninitialized handle, which
 * is undefined behavior.  A slice whose thread cannot be created now
 * runs inline on the calling thread, and only successfully spawned
 * threads are joined. */
long long parallel_sum_pthreads(int *array, int n) {
    pthread_t threads[NUM_THREADS];
    thread_data_t thread_data[NUM_THREADS];
    int spawned[NUM_THREADS];
    int chunk_size = n / NUM_THREADS;
    for (int i = 0; i < NUM_THREADS; i++) {
        thread_data[i].array = array;
        thread_data[i].start = i * chunk_size;
        thread_data[i].end = (i == NUM_THREADS - 1) ? n : (i + 1) * chunk_size;
        spawned[i] = pthread_create(&threads[i], NULL,
                                    compute_partial_sum,
                                    &thread_data[i]) == 0;
        if (!spawned[i]) {
            compute_partial_sum(&thread_data[i]); /* degrade gracefully */
        }
    }
    long long total_sum = 0;
    for (int i = 0; i < NUM_THREADS; i++) {
        if (spawned[i]) {
            pthread_join(threads[i], NULL);
        }
        total_sum += thread_data[i].sum;
    }
    return total_sum;
}
/* Single-threaded reference: sums all n elements into a long long so the
 * total cannot overflow for int-sized inputs. */
long long sequential_sum(int *array, int n) {
    long long total = 0;
    for (int idx = 0; idx < n; idx++) {
        total += array[idx];
    }
    return total;
}
/* Compares sequential_sum() with parallel_sum_pthreads() on ARRAY_SIZE
 * ints and reports both results plus the speedup.
 * Fix: the malloc() result is now checked before use. */
void pthreads_example(void) {
    int *array = malloc(ARRAY_SIZE * sizeof *array);
    if (!array) {
        printf("Allocation failure\n");
        return;
    }
    /* Deterministic input data. */
    for (int i = 0; i < ARRAY_SIZE; i++) {
        array[i] = i % 1000;
    }
    clock_t start = clock();
    long long seq_sum = sequential_sum(array, ARRAY_SIZE);
    clock_t end = clock();
    double sequential_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    start = clock();
    long long par_sum = parallel_sum_pthreads(array, ARRAY_SIZE);
    end = clock();
    double parallel_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    printf("Sequential sum: %lld (time: %.4f seconds)\n", seq_sum, sequential_time);
    printf("Parallel sum: %lld (time: %.4f seconds)\n", par_sum, parallel_time);
    printf("Speedup: %.2fx\n", sequential_time / parallel_time);
    free(array);
}
Architecture-Specific Optimizations
Different CPU architectures have unique features that can be leveraged for performance.
Cache Optimization Techniques
Optimizing for cache hierarchy and behavior:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define MATRIX_SIZE 1024
// Cache-unfriendly matrix transpose
/* Naive transpose: reads src row-major but writes dst column-major, so
 * for large n every store lands on a different cache line. */
void transpose_unfriendly(int n, int *src, int *dst) {
    for (int row = 0; row < n; row++) {
        for (int col = 0; col < n; col++) {
            dst[col * n + row] = src[row * n + col];
        }
    }
}
// Cache-friendly matrix transpose with blocking
/* Tiled transpose: processes the matrix in 32x32 tiles so that both the
 * source and destination tiles stay cache-resident while traversed
 * (two 32*32 int tiles = 8 KB, comfortably within L1). */
void transpose_blocked(int n, int *src, int *dst) {
    enum { TILE = 32 };
    for (int r0 = 0; r0 < n; r0 += TILE) {
        for (int c0 = 0; c0 < n; c0 += TILE) {
            /* Clamp the tile at the matrix edge. */
            int rmax = (r0 + TILE < n) ? r0 + TILE : n;
            int cmax = (c0 + TILE < n) ? c0 + TILE : n;
            for (int r = r0; r < rmax; r++) {
                for (int c = c0; c < cmax; c++) {
                    dst[c * n + r] = src[r * n + c];
                }
            }
        }
    }
}
/* Benchmarks the naive transpose against the tiled version on a
 * MATRIX_SIZE x MATRIX_SIZE matrix and reports the improvement.
 * Fix: the two malloc() results are now checked before use. */
void cache_optimization_example(void) {
    int *matrix = malloc(MATRIX_SIZE * MATRIX_SIZE * sizeof *matrix);
    int *transposed = malloc(MATRIX_SIZE * MATRIX_SIZE * sizeof *transposed);
    if (!matrix || !transposed) {
        printf("Allocation failure\n");
        free(matrix);
        free(transposed);
        return;
    }
    /* Deterministic input data. */
    for (int i = 0; i < MATRIX_SIZE * MATRIX_SIZE; i++) {
        matrix[i] = i;
    }
    clock_t start, end;
    start = clock();
    transpose_unfriendly(MATRIX_SIZE, matrix, transposed);
    end = clock();
    double unfriendly_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    start = clock();
    transpose_blocked(MATRIX_SIZE, matrix, transposed);
    end = clock();
    double blocked_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    printf("Unfriendly transpose: %.4f seconds\n", unfriendly_time);
    printf("Blocked transpose: %.4f seconds\n", blocked_time);
    printf("Improvement: %.2fx\n", unfriendly_time / blocked_time);
    free(matrix);
    free(transposed);
}
Branch Prediction Optimization
Improving branch prediction accuracy for better performance:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define ARRAY_SIZE 1000000
// Function with unpredictable branches
/* Applies one of two updates chosen by rand(), giving the CPU a branch
 * it cannot learn (~50% misprediction rate).  Exactly one rand() call is
 * made per element, matching the original call pattern. */
void unpredictable_branches(int *array, int n) {
    for (int idx = 0; idx < n; idx++) {
        int coin = rand() % 2;
        if (coin) {
            array[idx] *= 2;
        } else {
            array[idx] += 1;
        }
    }
}
// Function with predictable branches
/* Strictly alternating even/odd branch pattern — trivially learned by
 * the branch predictor: even indices are doubled, odd indices are
 * incremented. */
void predictable_branches(int *array, int n) {
    for (int idx = 0; idx < n; idx++) {
        if (idx % 2 == 0) {
            array[idx] *= 2;
        } else {
            array[idx] += 1;
        }
    }
}
// Branchless version using conditional moves
/* Same transform as predictable_branches() expressed as arithmetic
 * instead of a branch: even index -> a*2, odd index -> a+1. */
void branchless_version(int *array, int n) {
    for (int idx = 0; idx < n; idx++) {
        int even = !(idx & 1); /* 1 for even indices, 0 for odd */
        array[idx] = array[idx] * (1 + even) + (1 - even);
    }
}
/* Benchmarks the unpredictable, predictable and branchless variants on
 * identical inputs and reports the relative improvements.
 * Fix: the three malloc() results are now checked before use. */
void branch_prediction_example(void) {
    int *array1 = malloc(ARRAY_SIZE * sizeof *array1);
    int *array2 = malloc(ARRAY_SIZE * sizeof *array2);
    int *array3 = malloc(ARRAY_SIZE * sizeof *array3);
    if (!array1 || !array2 || !array3) {
        printf("Allocation failure\n");
        free(array1);
        free(array2);
        free(array3);
        return;
    }
    /* Identical deterministic input for all three variants. */
    for (int i = 0; i < ARRAY_SIZE; i++) {
        array1[i] = array2[i] = array3[i] = i;
    }
    clock_t start, end;
    srand(42); /* fixed seed keeps runs repeatable */
    start = clock();
    unpredictable_branches(array1, ARRAY_SIZE);
    end = clock();
    double unpredictable_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    start = clock();
    predictable_branches(array2, ARRAY_SIZE);
    end = clock();
    double predictable_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    start = clock();
    branchless_version(array3, ARRAY_SIZE);
    end = clock();
    double branchless_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    printf("Unpredictable branches: %.4f seconds\n", unpredictable_time);
    printf("Predictable branches: %.4f seconds\n", predictable_time);
    printf("Branchless version: %.4f seconds\n", branchless_time);
    printf("Predictable improvement: %.2fx\n", unpredictable_time / predictable_time);
    printf("Branchless improvement: %.2fx\n", unpredictable_time / branchless_time);
    free(array1);
    free(array2);
    free(array3);
}
Memory-Level Parallelism
Exploiting memory-level parallelism to hide latency:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define ARRAY_SIZE 1000000
// Function that can benefit from memory-level parallelism
/* Sums three independent input streams into a fourth array.  The loads
 * from a, b and c are independent of each other, so the CPU can keep
 * several outstanding cache misses in flight at once (memory-level
 * parallelism), hiding part of the memory latency.
 * Fix: the four malloc() results are now checked before use. */
void memory_parallelism_example(void) {
    double *a = malloc(ARRAY_SIZE * sizeof *a);
    double *b = malloc(ARRAY_SIZE * sizeof *b);
    double *c = malloc(ARRAY_SIZE * sizeof *c);
    double *d = malloc(ARRAY_SIZE * sizeof *d);
    if (!a || !b || !c || !d) {
        printf("Allocation failure\n");
        free(a);
        free(b);
        free(c);
        free(d);
        return;
    }
    /* Deterministic input data. */
    for (int i = 0; i < ARRAY_SIZE; i++) {
        a[i] = (double)i;
        b[i] = (double)(i * 2);
        c[i] = (double)(i * 3);
        d[i] = 0.0;
    }
    clock_t start = clock();
    for (int i = 0; i < ARRAY_SIZE; i++) {
        d[i] = a[i] + b[i] + c[i];
    }
    clock_t end = clock();
    double time_spent = ((double)(end - start)) / CLOCKS_PER_SEC;
    printf("Memory parallelism example time: %.4f seconds\n", time_spent);
    printf("First few results: ");
    for (int i = 0; i < 5; i++) {
        printf("%.0f ", d[i]);
    }
    printf("\n");
    free(a);
    free(b);
    free(c);
    free(d);
}
Advanced Profiling and Analysis
Using sophisticated tools for performance analysis:
#include <stdio.h>
#include <stdlib.h>
#ifdef __linux__
#include <sys/resource.h>
#endif
// Function to measure memory usage
/* Prints process memory statistics via getrusage(); compiled to a no-op
 * on non-Linux platforms.  Note: on Linux, ru_maxrss is reported in
 * kilobytes. */
void print_memory_usage(void) {
#ifdef __linux__
    struct rusage usage;
    if (getrusage(RUSAGE_SELF, &usage) != 0) {
        return; /* fail silently, as before */
    }
    printf("Maximum resident set size: %ld KB\n", usage.ru_maxrss);
    printf("Page reclaims: %ld\n", usage.ru_minflt);
    printf("Page faults: %ld\n", usage.ru_majflt);
#endif
}
// Function to demonstrate performance counters (conceptual)
/* Conceptual overview of hardware performance counters; real numbers
 * come from external tools such as perf (Linux), Intel VTune, Apple
 * Instruments, or GNU gprof. */
void performance_counter_example(void) {
    static const char *const counters[] = {
        "Performance counter example:\n",
        "- Instructions per cycle (IPC)\n",
        "- Cache miss rates\n",
        "- Branch misprediction rate\n",
        "- Memory bandwidth utilization\n",
    };
    for (size_t i = 0; i < sizeof counters / sizeof counters[0]; i++) {
        fputs(counters[i], stdout);
    }
}
Optimization Decision Making
Making informed decisions about which optimizations to apply:
#include <stdio.h>
#include <time.h>
// Example showing when optimization matters
/* Prints guidance on when optimization is worthwhile, then times a
 * trivial loop to illustrate measuring an optimization's impact.  The
 * volatile accumulator keeps the loop from being optimized away. */
void optimization_decision_example(void) {
    static const char *const tips[] = {
        "When to optimize:\n",
        "1. Profile first - identify actual bottlenecks\n",
        "2. Consider algorithmic improvements before micro-optimizations\n",
        "3. Optimize hot paths - functions called frequently\n",
        "4. Balance performance with maintainability\n",
        "5. Consider target platform and constraints\n",
        "6. Measure impact of each optimization\n",
    };
    for (size_t i = 0; i < sizeof tips / sizeof tips[0]; i++) {
        fputs(tips[i], stdout);
    }
    const int iterations = 100000000;
    clock_t t0 = clock();
    volatile long long sum = 0;
    for (int i = 0; i < iterations; i++) {
        sum += i;
    }
    clock_t t1 = clock();
    (void)sum;
    double elapsed = (double)(t1 - t0) / CLOCKS_PER_SEC;
    printf("\nSimple loop time: %.4f seconds\n", elapsed);
}
Practical Optimization Workflow
A systematic approach to performance optimization:
#include <stdio.h>
/* Prints the six-step performance-optimization checklist.  The text is
 * kept in a single table so steps can be edited in one place. */
void optimization_workflow(void) {
    static const char *const steps[] = {
        "Performance Optimization Workflow:\n\n",
        "1. Profiling and Measurement\n",
        " - Use profiling tools to identify bottlenecks\n",
        " - Measure baseline performance\n",
        " - Set performance goals\n\n",
        "2. Algorithmic Analysis\n",
        " - Review algorithmic complexity\n",
        " - Consider alternative algorithms\n",
        " - Optimize data structures\n\n",
        "3. Implementation Optimization\n",
        " - Apply compiler optimizations\n",
        " - Use appropriate data layouts\n",
        " - Minimize memory allocations\n\n",
        "4. Hardware-Specific Tuning\n",
        " - Leverage SIMD instructions\n",
        " - Utilize multiple cores\n",
        " - Optimize for cache hierarchy\n\n",
        "5. Validation and Testing\n",
        " - Verify correctness after optimization\n",
        " - Measure performance improvements\n",
        " - Test on target hardware\n\n",
        "6. Documentation and Maintenance\n",
        " - Document optimization decisions\n",
        " - Monitor performance over time\n",
        " - Maintain balance between performance and readability\n",
    };
    for (size_t i = 0; i < sizeof steps / sizeof steps[0]; i++) {
        fputs(steps[i], stdout);
    }
}
Summary
Advanced optimization techniques require a deep understanding of both the problem domain and the underlying hardware. Key takeaways include:
- Compiler Optimization - Leverage compiler flags and hints for automatic improvements
- Vectorization - Use SIMD instructions to process multiple data elements simultaneously
- Parallel Processing - Distribute work across multiple CPU cores using OpenMP or pthreads
- Architecture-Specific Optimizations - Tailor code for specific CPU features and cache behaviors
- Memory-Level Parallelism - Exploit the ability to perform multiple memory operations simultaneously
- Branch Prediction - Structure code to improve CPU branch prediction accuracy
- Systematic Approach - Follow a methodical workflow for identifying and addressing performance bottlenecks
Important principles to remember:
- Profile First - Always measure before and after optimization
- Algorithmic Improvements - Often provide bigger gains than micro-optimizations
- Platform Awareness - Consider target hardware characteristics
- Maintainability - Balance performance with code readability and maintainability
- Verification - Ensure optimizations don’t introduce bugs or change behavior
These advanced techniques, when applied appropriately, can yield substantial performance improvements in performance-critical C applications. However, they should be used judiciously and only after identifying actual bottlenecks through profiling.