Optimization Techniques

Introduction

Performance optimization in C programming involves applying various techniques to improve execution speed, reduce memory usage, and enhance overall efficiency. This chapter explores fundamental and advanced optimization strategies, including algorithmic improvements, loop optimizations, data structure selection, and compiler-assisted optimizations.

Algorithmic Optimization

The most significant performance improvements often come from choosing better algorithms and data structures.

Time Complexity Analysis

Understanding algorithmic complexity is fundamental to optimization:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// O(n^2) bubble sort
void bubble_sort(int* arr, int n) {
    for (int i = 0; i < n - 1; i++) {
        for (int j = 0; j < n - i - 1; j++) {
            if (arr[j] > arr[j + 1]) {
                int temp = arr[j];
                arr[j] = arr[j + 1];
                arr[j + 1] = temp;
            }
        }
    }
}

// O(n log n) quick sort
void quick_sort(int* arr, int low, int high) {
    if (low < high) {
        int pi = partition(arr, low, high);
        quick_sort(arr, low, pi - 1);
        quick_sort(arr, pi + 1, high);
    }
}

int partition(int* arr, int low, int high) {
    int pivot = arr[high];
    int i = (low - 1);
    
    for (int j = low; j <= high - 1; j++) {
        if (arr[j] < pivot) {
            i++;
            int temp = arr[i];
            arr[i] = arr[j];
            arr[j] = temp;
        }
    }
    int temp = arr[i + 1];
    arr[i + 1] = arr[high];
    arr[high] = temp;
    return (i + 1);
}

// O(n log n) merge sort
void merge_sort(int* arr, int l, int r) {
    if (l < r) {
        int m = l + (r - l) / 2;
        merge_sort(arr, l, m);
        merge_sort(arr, m + 1, r);
        merge(arr, l, m, r);
    }
}

void merge(int* arr, int l, int m, int r) {
    int i, j, k;
    int n1 = m - l + 1;
    int n2 = r - m;
    
    int* L = malloc(n1 * sizeof(int));
    int* R = malloc(n2 * sizeof(int));
    
    for (i = 0; i < n1; i++)
        L[i] = arr[l + i];
    for (j = 0; j < n2; j++)
        R[j] = arr[m + 1 + j];
    
    i = 0;
    j = 0;
    k = l;
    
    while (i < n1 && j < n2) {
        if (L[i] <= R[j]) {
            arr[k] = L[i];
            i++;
        } else {
            arr[k] = R[j];
            j++;
        }
        k++;
    }
    
    while (i < n1) {
        arr[k] = L[i];
        i++;
        k++;
    }
    
    while (j < n2) {
        arr[k] = R[j];
        j++;
        k++;
    }
    
    free(L);
    free(R);
}

// Performance comparison
void compare_sorting_algorithms(void) {
    const int size = 10000;
    int* arr1 = malloc(size * sizeof(int));
    int* arr2 = malloc(size * sizeof(int));
    int* arr3 = malloc(size * sizeof(int));
    
    // Initialize arrays with random data
    for (int i = 0; i < size; i++) {
        arr1[i] = arr2[i] = arr3[i] = rand() % 10000;
    }
    
    struct timespec start, end;
    
    // Test bubble sort
    clock_gettime(CLOCK_MONOTONIC, &start);
    bubble_sort(arr1, size);
    clock_gettime(CLOCK_MONOTONIC, &end);
    double bubble_time = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
    
    // Test quick sort
    clock_gettime(CLOCK_MONOTONIC, &start);
    quick_sort(arr2, 0, size - 1);
    clock_gettime(CLOCK_MONOTONIC, &end);
    double quick_time = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
    
    // Test merge sort
    clock_gettime(CLOCK_MONOTONIC, &start);
    merge_sort(arr3, 0, size - 1);
    clock_gettime(CLOCK_MONOTONIC, &end);
    double merge_time = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
    
    printf("Sorting Algorithm Performance (n=%d):\n", size);
    printf("  Bubble Sort: %.6f seconds\n", bubble_time);
    printf("  Quick Sort:  %.6f seconds\n", quick_time);
    printf("  Merge Sort:  %.6f seconds\n", merge_time);
    
    free(arr1);
    free(arr2);
    free(arr3);
}

Search Algorithm Optimization

Choosing the right search algorithm can dramatically improve performance:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// O(n) linear search
int linear_search(int* arr, int n, int target) {
    for (int i = 0; i < n; i++) {
        if (arr[i] == target) {
            return i;
        }
    }
    return -1;
}

// O(log n) binary search (requires sorted array)
int binary_search(int* arr, int n, int target) {
    int left = 0;
    int right = n - 1;
    
    while (left <= right) {
        int mid = left + (right - left) / 2;
        
        if (arr[mid] == target) {
            return mid;
        }
        
        if (arr[mid] < target) {
            left = mid + 1;
        } else {
            right = mid - 1;
        }
    }
    
    return -1;
}

// Hash table implementation for O(1) average case search
#define HASH_TABLE_SIZE 1009

typedef struct hash_node {
    int key;
    int value;
    struct hash_node* next;
} hash_node_t;

static hash_node_t* hash_table[HASH_TABLE_SIZE];

unsigned int hash_function(int key) {
    return (unsigned int)(key % HASH_TABLE_SIZE);
}

void hash_table_insert(int key, int value) {
    unsigned int index = hash_function(key);
    
    hash_node_t* node = malloc(sizeof(hash_node_t));
    node->key = key;
    node->value = value;
    node->next = hash_table[index];
    hash_table[index] = node;
}

int hash_table_search(int key) {
    unsigned int index = hash_function(key);
    hash_node_t* node = hash_table[index];
    
    while (node != NULL) {
        if (node->key == key) {
            return node->value;
        }
        node = node->next;
    }
    
    return -1;
}

void hash_table_clear(void) {
    for (int i = 0; i < HASH_TABLE_SIZE; i++) {
        hash_node_t* node = hash_table[i];
        while (node != NULL) {
            hash_node_t* next = node->next;
            free(node);
            node = next;
        }
        hash_table[i] = NULL;
    }
}

// Performance comparison
void compare_search_algorithms(void) {
    const int size = 100000;
    int* sorted_array = malloc(size * sizeof(int));
    int* unsorted_array = malloc(size * sizeof(int));
    
    // Initialize arrays
    for (int i = 0; i < size; i++) {
        sorted_array[i] = i * 2;  // Even numbers
        unsorted_array[i] = rand() % (size * 2);
        hash_table_insert(unsorted_array[i], i);
    }
    
    int target = sorted_array[size / 2];  // Target in middle
    
    struct timespec start, end;
    
    // Test linear search
    clock_gettime(CLOCK_MONOTONIC, &start);
    int linear_result = linear_search(unsorted_array, size, target);
    clock_gettime(CLOCK_MONOTONIC, &end);
    double linear_time = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
    
    // Test binary search
    clock_gettime(CLOCK_MONOTONIC, &start);
    int binary_result = binary_search(sorted_array, size, target);
    clock_gettime(CLOCK_MONOTONIC, &end);
    double binary_time = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
    
    // Test hash table search
    clock_gettime(CLOCK_MONOTONIC, &start);
    int hash_result = hash_table_search(target);
    clock_gettime(CLOCK_MONOTONIC, &end);
    double hash_time = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
    
    printf("Search Algorithm Performance (n=%d):\n", size);
    printf("  Linear Search: %.9f seconds (result: %d)\n", linear_time, linear_result);
    printf("  Binary Search: %.9f seconds (result: %d)\n", binary_time, binary_result);
    printf("  Hash Search:   %.9f seconds (result: %d)\n", hash_time, hash_result);
    
    free(sorted_array);
    free(unsorted_array);
    hash_table_clear();
}

Loop Optimization

Loop optimization techniques can significantly improve performance by reducing overhead and improving cache locality.

Loop Unrolling

Loop unrolling reduces loop overhead by processing multiple iterations in each loop cycle:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// Standard loop
void standard_loop(int* arr, int n) {
    for (int i = 0; i < n; i++) {
        arr[i] = arr[i] * 2 + 1;
    }
}

// Unrolled loop (factor of 4)
void unrolled_loop(int* arr, int n) {
    int i;
    // Process 4 elements per iteration
    for (i = 0; i < n - 3; i += 4) {
        arr[i] = arr[i] * 2 + 1;
        arr[i + 1] = arr[i + 1] * 2 + 1;
        arr[i + 2] = arr[i + 2] * 2 + 1;
        arr[i + 3] = arr[i + 3] * 2 + 1;
    }
    
    // Handle remaining elements
    for (; i < n; i++) {
        arr[i] = arr[i] * 2 + 1;
    }
}

// Duff's device (advanced loop unrolling)
void duff_device(int* to, int* from, int count) {
    int n = (count + 7) / 8;
    switch (count % 8) {
        case 0: do { *to++ = *from++;
        case 7:      *to++ = *from++;
        case 6:      *to++ = *from++;
        case 5:      *to++ = *from++;
        case 4:      *to++ = *from++;
        case 3:      *to++ = *from++;
        case 2:      *to++ = *from++;
        case 1:      *to++ = *from++;
                } while (--n > 0);
    }
}

// Performance comparison
void compare_loop_optimizations(void) {
    const int size = 1000000;
    int* arr1 = malloc(size * sizeof(int));
    int* arr2 = malloc(size * sizeof(int));
    
    // Initialize arrays
    for (int i = 0; i < size; i++) {
        arr1[i] = arr2[i] = i;
    }
    
    struct timespec start, end;
    
    // Test standard loop
    clock_gettime(CLOCK_MONOTONIC, &start);
    standard_loop(arr1, size);
    clock_gettime(CLOCK_MONOTONIC, &end);
    double standard_time = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
    
    // Test unrolled loop
    clock_gettime(CLOCK_MONOTONIC, &start);
    unrolled_loop(arr2, size);
    clock_gettime(CLOCK_MONOTONIC, &end);
    double unrolled_time = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
    
    printf("Loop Optimization Performance (n=%d):\n", size);
    printf("  Standard Loop: %.6f seconds\n", standard_time);
    printf("  Unrolled Loop: %.6f seconds\n", unrolled_time);
    printf("  Speedup: %.2fx\n", standard_time / unrolled_time);
    
    free(arr1);
    free(arr2);
}

Loop Fusion and Fission

Combining or splitting loops can improve cache performance:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define SIZE 1000000

// Loop fusion example
void fused_loops(float* a, float* b, float* c, float* d, int n) {
    for (int i = 0; i < n; i++) {
        a[i] = b[i] + c[i];  // First operation
        d[i] = a[i] * 2.0f;  // Second operation (depends on first)
    }
}

// Loop fission example
void fissioned_loops(float* a, float* b, float* c, float* d, int n) {
    // First loop
    for (int i = 0; i < n; i++) {
        a[i] = b[i] + c[i];
    }
    
    // Second loop
    for (int i = 0; i < n; i++) {
        d[i] = a[i] * 2.0f;
    }
}

// Cache-friendly loop fusion
void cache_friendly_fusion(float* a, float* b, float* c, float* d, int n) {
    const int block_size = 1024;  // Cache line friendly
    
    for (int block = 0; block < n; block += block_size) {
        int end = (block + block_size < n) ? block + block_size : n;
        
        // Process block
        for (int i = block; i < end; i++) {
            a[i] = b[i] + c[i];
            d[i] = a[i] * 2.0f;
        }
    }
}

// Performance comparison
void compare_loop_transformations(void) {
    float* a1 = malloc(SIZE * sizeof(float));
    float* b1 = malloc(SIZE * sizeof(float));
    float* c1 = malloc(SIZE * sizeof(float));
    float* d1 = malloc(SIZE * sizeof(float));
    
    float* a2 = malloc(SIZE * sizeof(float));
    float* b2 = malloc(SIZE * sizeof(float));
    float* c2 = malloc(SIZE * sizeof(float));
    float* d2 = malloc(SIZE * sizeof(float));
    
    float* a3 = malloc(SIZE * sizeof(float));
    float* b3 = malloc(SIZE * sizeof(float));
    float* c3 = malloc(SIZE * sizeof(float));
    float* d3 = malloc(SIZE * sizeof(float));
    
    // Initialize arrays
    for (int i = 0; i < SIZE; i++) {
        b1[i] = b2[i] = b3[i] = (float)i;
        c1[i] = c2[i] = c3[i] = (float)(i * 2);
    }
    
    struct timespec start, end;
    
    // Test fused loops
    clock_gettime(CLOCK_MONOTONIC, &start);
    fused_loops(a1, b1, c1, d1, SIZE);
    clock_gettime(CLOCK_MONOTONIC, &end);
    double fused_time = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
    
    // Test fissioned loops
    clock_gettime(CLOCK_MONOTONIC, &start);
    fissioned_loops(a2, b2, c2, d2, SIZE);
    clock_gettime(CLOCK_MONOTONIC, &end);
    double fissioned_time = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
    
    // Test cache-friendly fusion
    clock_gettime(CLOCK_MONOTONIC, &start);
    cache_friendly_fusion(a3, b3, c3, d3, SIZE);
    clock_gettime(CLOCK_MONOTONIC, &end);
    double cache_friendly_time = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
    
    printf("Loop Transformation Performance (n=%d):\n", SIZE);
    printf("  Fused Loops:        %.6f seconds\n", fused_time);
    printf("  Fissioned Loops:    %.6f seconds\n", fissioned_time);
    printf("  Cache-Friendly:     %.6f seconds\n", cache_friendly_time);
    
    free(a1); free(b1); free(c1); free(d1);
    free(a2); free(b2); free(c2); free(d2);
    free(a3); free(b3); free(c3); free(d3);
}

Data Structure Optimization

Choosing the right data structures can have a dramatic impact on performance.

Array vs Linked List

Different data structures have different performance characteristics:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

// Array-based implementation
typedef struct {
    int* data;
    int size;
    int capacity;
} array_list_t;

array_list_t* array_list_create(int initial_capacity) {
    array_list_t* list = malloc(sizeof(array_list_t));
    list->data = malloc(initial_capacity * sizeof(int));
    list->size = 0;
    list->capacity = initial_capacity;
    return list;
}

void array_list_append(array_list_t* list, int value) {
    if (list->size >= list->capacity) {
        list->capacity *= 2;
        list->data = realloc(list->data, list->capacity * sizeof(int));
    }
    list->data[list->size++] = value;
}

int array_list_get(array_list_t* list, int index) {
    if (index >= 0 && index < list->size) {
        return list->data[index];
    }
    return -1;
}

void array_list_free(array_list_t* list) {
    free(list->data);
    free(list);
}

// Linked list implementation
typedef struct node {
    int data;
    struct node* next;
} node_t;

typedef struct {
    node_t* head;
    int size;
} linked_list_t;

linked_list_t* linked_list_create(void) {
    linked_list_t* list = malloc(sizeof(linked_list_t));
    list->head = NULL;
    list->size = 0;
    return list;
}

void linked_list_append(linked_list_t* list, int value) {
    node_t* new_node = malloc(sizeof(node_t));
    new_node->data = value;
    new_node->next = NULL;
    
    if (list->head == NULL) {
        list->head = new_node;
    } else {
        node_t* current = list->head;
        while (current->next != NULL) {
            current = current->next;
        }
        current->next = new_node;
    }
    list->size++;
}

int linked_list_get(linked_list_t* list, int index) {
    node_t* current = list->head;
    for (int i = 0; i < index && current != NULL; i++) {
        current = current->next;
    }
    return (current != NULL) ? current->data : -1;
}

void linked_list_free(linked_list_t* list) {
    node_t* current = list->head;
    while (current != NULL) {
        node_t* next = current->next;
        free(current);
        current = next;
    }
    free(list);
}

// Performance comparison
void compare_data_structures(void) {
    const int size = 100000;
    
    struct timespec start, end;
    
    // Test array list
    clock_gettime(CLOCK_MONOTONIC, &start);
    array_list_t* arr_list = array_list_create(1000);
    for (int i = 0; i < size; i++) {
        array_list_append(arr_list, i);
    }
    for (int i = 0; i < size; i += 100) {
        array_list_get(arr_list, i);
    }
    clock_gettime(CLOCK_MONOTONIC, &end);
    double array_time = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
    
    // Test linked list
    clock_gettime(CLOCK_MONOTONIC, &start);
    linked_list_t* link_list = linked_list_create();
    for (int i = 0; i < size; i++) {
        linked_list_append(link_list, i);
    }
    for (int i = 0; i < size; i += 100) {
        linked_list_get(link_list, i);
    }
    clock_gettime(CLOCK_MONOTONIC, &end);
    double linked_time = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
    
    printf("Data Structure Performance (n=%d):\n", size);
    printf("  Array List:  %.6f seconds\n", array_time);
    printf("  Linked List: %.6f seconds\n", linked_time);
    printf("  Ratio: %.2fx\n", linked_time / array_time);
    
    array_list_free(arr_list);
    linked_list_free(link_list);
}

Compiler Optimization Techniques

Understanding and leveraging compiler optimizations is crucial for performance.

Compiler Flags and Pragmas

#include <stdio.h>
#include <stdlib.h>

// Function inlining hints
static inline int fast_add(int a, int b) {
    return a + b;
}

// Restrict keyword for pointer aliasing
void vector_add(float* restrict a, float* restrict b, float* restrict c, int n) {
    for (int i = 0; i < n; i++) {
        c[i] = a[i] + b[i];
    }
}

// Loop unrolling pragma
void optimized_loop(float* arr, int n) {
    #pragma GCC unroll 4
    for (int i = 0; i < n; i++) {
        arr[i] = arr[i] * 2.0f + 1.0f;
    }
}

// Vectorization hints
void vectorized_operation(float* arr, int n) {
    #pragma GCC ivdep
    for (int i = 0; i < n; i++) {
        arr[i] = arr[i] * arr[i] + 2.0f * arr[i] + 1.0f;
    }
}

// Branch prediction hints
void branch_prediction_example(int* arr, int n) {
    for (int i = 0; i < n; i++) {
        if (__builtin_expect(arr[i] > 0, 1)) {  // Likely true
            arr[i] *= 2;
        } else {
            arr[i] = 0;
        }
    }
}

// Memory alignment
struct aligned_struct {
    int a;
    double b;
    char c;
} __attribute__((aligned(16)));

// Example usage
void compiler_optimization_examples(void) {
    const int size = 1000000;
    float* a = malloc(size * sizeof(float));
    float* b = malloc(size * sizeof(float));
    float* c = malloc(size * sizeof(float));
    
    // Initialize arrays
    for (int i = 0; i < size; i++) {
        a[i] = b[i] = (float)i;
    }
    
    // Test optimized operations
    vector_add(a, b, c, size);
    optimized_loop(c, size);
    vectorized_operation(c, size);
    
    printf("Compiler optimization examples completed\n");
    
    free(a);
    free(b);
    free(c);
}

Mathematical Optimization

Mathematical optimizations can significantly improve performance in numerical computations.

Fast Mathematical Operations

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

// Fast integer square root (Newton's method)
int fast_sqrt(int x) {
    if (x == 0) return 0;
    
    int guess = x;
    int better_guess;
    
    do {
        better_guess = (guess + x / guess) / 2;
        if (better_guess >= guess) break;
        guess = better_guess;
    } while (1);
    
    return guess;
}

// Fast integer power
int fast_power(int base, int exp) {
    int result = 1;
    while (exp > 0) {
        if (exp & 1) {
            result *= base;
        }
        base *= base;
        exp >>= 1;
    }
    return result;
}

// Bit manipulation for multiplication by powers of 2
int multiply_by_power_of_2(int value, int power) {
    return value << power;  // Equivalent to value * (2^power)
}

// Bit manipulation for division by powers of 2
int divide_by_power_of_2(int value, int power) {
    return value >> power;  // Equivalent to value / (2^power) for positive numbers
}

// Fast absolute value
int fast_abs(int x) {
    int mask = x >> (sizeof(int) * 8 - 1);
    return (x + mask) ^ mask;
}

// Performance comparison
void compare_mathematical_optimizations(void) {
    const int iterations = 1000000;
    
    struct timespec start, end;
    
    // Test standard sqrt
    clock_gettime(CLOCK_MONOTONIC, &start);
    volatile double sum1 = 0;
    for (int i = 1; i <= iterations; i++) {
        sum1 += sqrt(i);
    }
    clock_gettime(CLOCK_MONOTONIC, &end);
    double sqrt_time = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
    
    // Test fast sqrt
    clock_gettime(CLOCK_MONOTONIC, &start);
    volatile int sum2 = 0;
    for (int i = 1; i <= iterations; i++) {
        sum2 += fast_sqrt(i);
    }
    clock_gettime(CLOCK_MONOTONIC, &end);
    double fast_sqrt_time = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
    
    // Test standard power
    clock_gettime(CLOCK_MONOTONIC, &start);
    volatile int sum3 = 0;
    for (int i = 1; i <= 1000; i++) {
        sum3 += (int)pow(2, i % 10);
    }
    clock_gettime(CLOCK_MONOTONIC, &end);
    double power_time = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
    
    // Test fast power
    clock_gettime(CLOCK_MONOTONIC, &start);
    volatile int sum4 = 0;
    for (int i = 1; i <= 1000; i++) {
        sum4 += fast_power(2, i % 10);
    }
    clock_gettime(CLOCK_MONOTONIC, &end);
    double fast_power_time = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
    
    printf("Mathematical Optimization Performance:\n");
    printf("  Standard sqrt: %.6f seconds\n", sqrt_time);
    printf("  Fast sqrt:     %.6f seconds\n", fast_sqrt_time);
    printf("  Standard power: %.6f seconds\n", power_time);
    printf("  Fast power:     %.6f seconds\n", fast_power_time);
}

Practical Examples

Image Processing Optimization

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define IMAGE_WIDTH 1920
#define IMAGE_HEIGHT 1080

// Simple image structure
typedef struct {
    unsigned char* data;
    int width;
    int height;
} image_t;

// Create image
image_t* create_image(int width, int height) {
    image_t* img = malloc(sizeof(image_t));
    img->data = malloc(width * height * sizeof(unsigned char));
    img->width = width;
    img->height = height;
    return img;
}

// Free image
void free_image(image_t* img) {
    free(img->data);
    free(img);
}

// Naive blur implementation
void naive_blur(image_t* src, image_t* dst) {
    for (int y = 1; y < src->height - 1; y++) {
        for (int x = 1; x < src->width - 1; x++) {
            int sum = 0;
            for (int ky = -1; ky <= 1; ky++) {
                for (int kx = -1; kx <= 1; kx++) {
                    sum += src->data[(y + ky) * src->width + (x + kx)];
                }
            }
            dst->data[y * src->width + x] = sum / 9;
        }
    }
}

// Optimized blur with loop unrolling
void optimized_blur(image_t* src, image_t* dst) {
    for (int y = 1; y < src->height - 1; y++) {
        for (int x = 1; x < src->width - 1; x++) {
            // Unroll the inner loops
            int sum = src->data[(y-1) * src->width + (x-1)] +
                      src->data[(y-1) * src->width + x] +
                      src->data[(y-1) * src->width + (x+1)] +
                      src->data[y * src->width + (x-1)] +
                      src->data[y * src->width + x] +
                      src->data[y * src->width + (x+1)] +
                      src->data[(y+1) * src->width + (x-1)] +
                      src->data[(y+1) * src->width + x] +
                      src->data[(y+1) * src->width + (x+1)];
            dst->data[y * src->width + x] = sum / 9;
        }
    }
}

// SIMD-style optimization using multiple operations per loop
void simd_style_blur(image_t* src, image_t* dst) {
    for (int y = 1; y < src->height - 1; y++) {
        // Process 4 pixels at a time
        for (int x = 1; x < src->width - 4; x += 4) {
            // Process 4 pixels in parallel (simplified)
            for (int i = 0; i < 4; i++) {
                int sum = src->data[(y-1) * src->width + (x+i-1)] +
                          src->data[(y-1) * src->width + (x+i)] +
                          src->data[(y-1) * src->width + (x+i+1)] +
                          src->data[y * src->width + (x+i-1)] +
                          src->data[y * src->width + (x+i)] +
                          src->data[y * src->width + (x+i+1)] +
                          src->data[(y+1) * src->width + (x+i-1)] +
                          src->data[(y+1) * src->width + (x+i)] +
                          src->data[(y+1) * src->width + (x+i+1)];
                dst->data[y * src->width + (x+i)] = sum / 9;
            }
        }
    }
}

// Performance comparison
void compare_image_processing(void) {
    image_t* src = create_image(IMAGE_WIDTH, IMAGE_HEIGHT);
    image_t* dst1 = create_image(IMAGE_WIDTH, IMAGE_HEIGHT);
    image_t* dst2 = create_image(IMAGE_WIDTH, IMAGE_HEIGHT);
    image_t* dst3 = create_image(IMAGE_WIDTH, IMAGE_HEIGHT);
    
    // Initialize source image with random data
    for (int i = 0; i < IMAGE_WIDTH * IMAGE_HEIGHT; i++) {
        src->data[i] = rand() % 256;
    }
    
    struct timespec start, end;
    
    // Test naive blur
    clock_gettime(CLOCK_MONOTONIC, &start);
    naive_blur(src, dst1);
    clock_gettime(CLOCK_MONOTONIC, &end);
    double naive_time = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
    
    // Test optimized blur
    clock_gettime(CLOCK_MONOTONIC, &start);
    optimized_blur(src, dst2);
    clock_gettime(CLOCK_MONOTONIC, &end);
    double optimized_time = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
    
    // Test SIMD-style blur
    clock_gettime(CLOCK_MONOTONIC, &start);
    simd_style_blur(src, dst3);
    clock_gettime(CLOCK_MONOTONIC, &end);
    double simd_time = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9;
    
    printf("Image Processing Performance (%dx%d):\n", IMAGE_WIDTH, IMAGE_HEIGHT);
    printf("  Naive Blur:     %.6f seconds\n", naive_time);
    printf("  Optimized Blur: %.6f seconds\n", optimized_time);
    printf("  SIMD-style:     %.6f seconds\n", simd_time);
    printf("  Speedup:        %.2fx\n", naive_time / simd_time);
    
    free_image(src);
    free_image(dst1);
    free_image(dst2);
    free_image(dst3);
}

Summary

Optimization techniques in C programming encompass a wide range of approaches:

Algorithmic Optimization - Choosing better algorithms and data structures
Loop Optimization - Unrolling, fusion, and restructuring loops
Data Structure Optimization - Selecting appropriate data structures for the task
Compiler Optimization - Leveraging compiler features and flags
Mathematical Optimization - Fast mathematical operations and bit manipulation
Memory Access Optimization - Improving cache locality and reducing memory overhead

Key principles for effective optimization: - Profile first to identify actual bottlenecks - Focus on algorithmic improvements before micro-optimizations - Understand the trade-offs between performance and maintainability - Use appropriate compiler optimization flags - Consider the target hardware architecture - Test optimizations thoroughly to ensure correctness - Document optimization decisions for future maintenance

These techniques enable developers to create high-performance C applications that efficiently utilize system resources while maintaining code quality and readability.