Memory Optimization
Introduction
Memory optimization is a critical aspect of performance tuning in C programs, especially in resource-constrained environments and high-performance applications. Efficient memory management can significantly impact program performance through improved cache utilization, reduced allocation overhead, and better data locality. This chapter explores techniques for optimizing memory usage, from allocation strategies to cache-friendly data structures.
Memory Hierarchy and Performance
Understanding the memory hierarchy is fundamental to effective memory optimization. Modern computer systems have multiple levels of memory with vastly different access times.
Memory Latency Comparison
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
// Example illustrating memory access time differences
// Print an order-of-magnitude cheat sheet of typical memory access
// latencies, motivating why cache-aware code matters.
void memory_latency_example(void) {
    static const char *const rows[] = {
        "- CPU Register: ~0.3 ns",
        "- L1 Cache: ~1 ns",
        "- L2 Cache: ~4 ns",
        "- L3 Cache: ~15 ns",
        "- Main Memory: ~100 ns",
        "- SSD: ~10 μs",
        "- HDD: ~10 ms",
    };
    printf("Typical Access Times:\n");
    for (size_t i = 0; i < sizeof rows / sizeof rows[0]; i++) {
        printf("%s\n", rows[i]);
    }
}
Cache Behavior Impact
Cache misses can cause significant performance degradation. Understanding how data is accessed affects cache performance:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define MATRIX_SIZE 1024
#define ITERATIONS 100
// Row-major access (cache-friendly)
// Sum every matrix element, walking each row left-to-right so consecutive
// accesses touch consecutive addresses (cache-friendly traversal).
double row_major_sum(double matrix[MATRIX_SIZE][MATRIX_SIZE]) {
    double total = 0.0;
    for (int r = 0; r < MATRIX_SIZE; r++) {
        const double *row = matrix[r];
        for (int c = 0; c < MATRIX_SIZE; c++) {
            total += row[c];
        }
    }
    return total;
}
// Column-major access (cache-unfriendly)
// Sum every matrix element, walking down each column so consecutive
// accesses stride a full row apart (cache-unfriendly traversal).
double column_major_sum(double matrix[MATRIX_SIZE][MATRIX_SIZE]) {
    double total = 0.0;
    for (int c = 0; c < MATRIX_SIZE; c++) {
        for (int r = 0; r < MATRIX_SIZE; r++) {
            total += matrix[r][c];
        }
    }
    return total;
}
// Time ITERATIONS row-major sweeps against ITERATIONS column-major sweeps
// over the same heap matrix and report the cache-locality speedup.
void cache_performance_comparison(void) {
    double (*matrix)[MATRIX_SIZE] = malloc(MATRIX_SIZE * sizeof(*matrix));
    if (!matrix) {                       // BUG FIX: allocation was unchecked
        printf("Failed to allocate matrix\n");
        return;
    }
    // Initialize matrix with distinct values
    for (int i = 0; i < MATRIX_SIZE; i++) {
        for (int j = 0; j < MATRIX_SIZE; j++) {
            matrix[i][j] = (double)(i * MATRIX_SIZE + j);
        }
    }
    clock_t start, end;
    volatile double sink;                // keeps the sums observable to the optimizer
    // Row-major timing
    start = clock();
    for (int i = 0; i < ITERATIONS; i++) {
        sink = row_major_sum(matrix);
    }
    end = clock();
    double row_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    // Column-major timing
    start = clock();
    for (int i = 0; i < ITERATIONS; i++) {
        sink = column_major_sum(matrix);
    }
    end = clock();
    double col_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    (void)sink;
    printf("Row-major time: %.4f seconds\n", row_time);
    printf("Column-major time: %.4f seconds\n", col_time);
    if (row_time > 0.0) {                // BUG FIX: guard division by zero
        printf("Performance difference: %.2fx\n", col_time / row_time);
    }
    free(matrix);
}
Memory Allocation Optimization
Standard memory allocation functions like malloc() and free() can be significant performance bottlenecks in performance-critical applications.
Custom Memory Allocators
For specific use cases, custom allocators can provide significant performance improvements:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
// Simple pool allocator
// One pool = `num_blocks` equal chunks of `block_size` bytes carved out of a
// single malloc'd slab, plus a byte-per-block occupancy map.
typedef struct {
void *memory_pool;   // backing slab of pool_size bytes
size_t pool_size;    // block_size * num_blocks
size_t block_size;   // bytes per block
size_t num_blocks;   // number of blocks in the slab
uint8_t *free_blocks; // occupancy map: 0 = free, 1 = in use
size_t next_free;    // rotating hint: index where pool_alloc starts scanning
} pool_allocator_t;
// Initialize pool allocator
// Create a pool of num_blocks blocks of block_size bytes each.
// Returns NULL (with nothing leaked) on invalid sizes, multiplication
// overflow, or any allocation failure.
pool_allocator_t* pool_init(size_t block_size, size_t num_blocks) {
    if (block_size == 0 || num_blocks == 0 ||
        block_size > SIZE_MAX / num_blocks) {   // BUG FIX: overflow was unchecked
        return NULL;
    }
    pool_allocator_t *pool = malloc(sizeof *pool);
    if (!pool) return NULL;
    pool->pool_size = block_size * num_blocks;
    pool->memory_pool = malloc(pool->pool_size);
    if (!pool->memory_pool) {
        free(pool);
        return NULL;
    }
    pool->block_size = block_size;
    pool->num_blocks = num_blocks;
    pool->free_blocks = calloc(num_blocks, sizeof(uint8_t));
    if (!pool->free_blocks) {            // BUG FIX: calloc result was unchecked
        free(pool->memory_pool);
        free(pool);
        return NULL;
    }
    pool->next_free = 0;
    return pool;
}
// Allocate block from pool
// Allocate one fixed-size block, or NULL when every block is in use.
// next_free is only a scan-start hint, so the loop probes all blocks once,
// wrapping around from the hint. (BUG FIX: the original's early
// `next_free >= num_blocks` "exhausted" test could never fire — next_free
// is always kept in range — and its final return was mislabeled
// "Should not happen" when it is in fact the real exhaustion path.)
void* pool_alloc(pool_allocator_t *pool) {
    if (!pool) {
        return NULL;
    }
    for (size_t i = 0; i < pool->num_blocks; i++) {
        size_t index = (pool->next_free + i) % pool->num_blocks;
        if (!pool->free_blocks[index]) {
            pool->free_blocks[index] = 1;
            pool->next_free = (index + 1) % pool->num_blocks;
            return (char*)pool->memory_pool + (index * pool->block_size);
        }
    }
    return NULL; // Pool exhausted: every block is marked in use
}
// Free block back to pool
// Return a block obtained from pool_alloc() to the pool. Ignores NULL,
// pointers outside the slab, and (BUG FIX) pointers that are inside the
// slab but not on a block boundary — previously a mid-block pointer would
// silently free the enclosing block.
void pool_free(pool_allocator_t *pool, void *ptr) {
    if (!pool || !ptr) return;
    ptrdiff_t offset = (char*)ptr - (char*)pool->memory_pool;
    if (offset < 0 || offset >= (ptrdiff_t)pool->pool_size) return;
    if ((size_t)offset % pool->block_size != 0) return; // not a block start
    size_t index = (size_t)offset / pool->block_size;
    pool->free_blocks[index] = 0;
    pool->next_free = index; // Prefer this block for next allocation
}
// Destroy pool allocator
// Release the slab, the occupancy map, and the descriptor itself.
// Safe to call with NULL; all pointers handed out by pool_alloc()
// become invalid afterwards.
void pool_destroy(pool_allocator_t *pool) {
    if (!pool) {
        return;
    }
    free(pool->memory_pool);
    free(pool->free_blocks);
    free(pool);
}
// Example usage
void pool_allocator_example(void) {
// Create pool for 100 blocks of 64 bytes each
pool_allocator_t *pool = pool_init(64, 100);
if (!pool) {
printf("Failed to create pool\n");
return;
}
// Allocate some blocks
void *ptr1 = pool_alloc(pool);
void *ptr2 = pool_alloc(pool);
void *ptr3 = pool_alloc(pool);
printf("Allocated 3 blocks\n");
// Free some blocks
pool_free(pool, ptr1);
pool_free(pool, ptr3);
printf("Freed 2 blocks\n");
// Allocate more blocks (should reuse freed ones)
void *ptr4 = pool_alloc(pool);
void *ptr5 = pool_alloc(pool);
printf("Allocated 2 more blocks\n");
// Cleanup
pool_destroy(pool);
}Stack Allocation vs Heap Allocation
Stack allocation is significantly faster than heap allocation but has size and lifetime limitations:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define ARRAY_SIZE 10000
#define ITERATIONS 100000
// Stack allocation
// Stack allocation: the array costs one stack-pointer adjustment.
// BUG FIX: without a volatile sink the optimizer may delete the whole body,
// making the benchmark measure nothing (the chapter's other examples
// already use this volatile-sink idiom).
void stack_allocation_test(void) {
    int array[ARRAY_SIZE];
    // Initialize array
    for (int i = 0; i < ARRAY_SIZE; i++) {
        array[i] = i;
    }
    volatile int sink = array[ARRAY_SIZE - 1];
    (void)sink;
}
// Heap allocation
// Heap allocation: malloc/free bracket the same initialization work.
// BUG FIX: volatile sink keeps the loop observable to the optimizer,
// consistent with stack_allocation_test.
void heap_allocation_test(void) {
    int *array = malloc(ARRAY_SIZE * sizeof(int));
    if (array) {
        // Initialize array
        for (int i = 0; i < ARRAY_SIZE; i++) {
            array[i] = i;
        }
        volatile int sink = array[ARRAY_SIZE - 1];
        (void)sink;
        free(array);
    }
}
// Time ITERATIONS stack-allocation calls against the same number of
// heap-allocation calls and report the ratio.
void allocation_performance_comparison(void) {
    clock_t start, end;
    // Stack allocation timing
    start = clock();
    for (int i = 0; i < ITERATIONS; i++) {
        stack_allocation_test();
    }
    end = clock();
    double stack_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    // Heap allocation timing
    start = clock();
    for (int i = 0; i < ITERATIONS; i++) {
        heap_allocation_test();
    }
    end = clock();
    double heap_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    printf("Stack allocation time: %.4f seconds\n", stack_time);
    printf("Heap allocation time: %.4f seconds\n", heap_time);
    if (stack_time > 0.0) {              // BUG FIX: guard division by zero
        printf("Stack is %.2fx faster\n", heap_time / stack_time);
    }
}
Data Structure Optimization
Choosing the right data structures and organizing them for cache efficiency can dramatically improve performance.
Structure of Arrays vs Array of Structures
The way data is organized in memory can significantly impact cache performance:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define NUM_POINTS 1000000
// Array of Structures (AoS) - cache-unfriendly for component-wise operations
typedef struct {
float x, y, z;   // one point's three components stored contiguously
} point_aos_t;
// Structure of Arrays (SoA) - cache-friendly for component-wise operations
typedef struct {
float *x, *y, *z;   // three parallel arrays of `count` floats each (see init_soa)
} point_soa_t;
// Initialize AoS
// Allocate and fill an AoS point set with deterministic test values.
// Returns NULL on allocation failure (BUG FIX: was used unchecked).
point_aos_t* init_aos(int count) {
    point_aos_t *points = malloc((size_t)count * sizeof *points);
    if (!points) return NULL;
    for (int i = 0; i < count; i++) {
        points[i].x = (float)i;
        points[i].y = (float)(i * 2);
        points[i].z = (float)(i * 3);
    }
    return points;
}
// Initialize SoA
// Allocate and fill an SoA point set with deterministic test values.
// Returns NULL — with nothing leaked — if any of the four allocations
// fails (BUG FIX: the component arrays were previously unchecked, so a
// partial failure produced a struct with NULL members and leaked the rest).
point_soa_t* init_soa(int count) {
    point_soa_t *points = malloc(sizeof *points);
    if (!points) return NULL;
    points->x = malloc((size_t)count * sizeof(float));
    points->y = malloc((size_t)count * sizeof(float));
    points->z = malloc((size_t)count * sizeof(float));
    if (!points->x || !points->y || !points->z) {
        free(points->x);   // free(NULL) is a no-op, so no guards needed
        free(points->y);
        free(points->z);
        free(points);
        return NULL;
    }
    for (int i = 0; i < count; i++) {
        points->x[i] = (float)i;
        points->y[i] = (float)(i * 2);
        points->z[i] = (float)(i * 3);
    }
    return points;
}
// Process all x coordinates (AoS version)
// Double every x component and accumulate the total. In the AoS layout
// each x is interleaved with unused y/z, wasting cache-line bandwidth.
float process_x_aos(point_aos_t *points, int count) {
    float acc = 0.0f;
    const point_aos_t *end = points + count;
    for (const point_aos_t *p = points; p < end; p++) {
        acc += p->x * 2.0f;
    }
    return acc;
}
// Process all x coordinates (SoA version)
// Double every x component and accumulate the total. In the SoA layout
// the x values are contiguous, so every fetched cache line is fully used.
float process_x_soa(point_soa_t *points, int count) {
    const float *xs = points->x;
    float acc = 0.0f;
    for (int i = 0; i < count; i++) {
        acc += xs[i] * 2.0f;
    }
    return acc;
}
// Compare component-wise processing throughput of the AoS and SoA layouts
// over NUM_POINTS points.
void aos_vs_soa_comparison(void) {
    point_aos_t *aos_points = init_aos(NUM_POINTS);
    point_soa_t *soa_points = init_soa(NUM_POINTS);
    if (!aos_points || !soa_points) {    // BUG FIX: init results were unchecked
        printf("Failed to allocate point data\n");
        free(aos_points);                // free(NULL) is a no-op
        if (soa_points) {
            free(soa_points->x);
            free(soa_points->y);
            free(soa_points->z);
            free(soa_points);
        }
        return;
    }
    clock_t start, end;
    // AoS timing
    start = clock();
    volatile float aos_result = process_x_aos(aos_points, NUM_POINTS);
    end = clock();
    double aos_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    // SoA timing
    start = clock();
    volatile float soa_result = process_x_soa(soa_points, NUM_POINTS);
    end = clock();
    double soa_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    (void)aos_result;
    (void)soa_result;
    printf("AoS processing time: %.4f seconds\n", aos_time);
    printf("SoA processing time: %.4f seconds\n", soa_time);
    if (soa_time > 0.0) {                // BUG FIX: guard division by zero
        printf("SoA is %.2fx faster\n", aos_time / soa_time);
    }
    // Cleanup
    free(aos_points);
    free(soa_points->x);
    free(soa_points->y);
    free(soa_points->z);
    free(soa_points);
}
Memory Alignment
Proper memory alignment can improve performance by ensuring efficient memory access:
#include <stdio.h>
#include <stdlib.h>
#include <stdalign.h>
#include <time.h>
#define ARRAY_SIZE 1000000
// Unaligned structure
// Field order forces internal padding; on common ABIs sizeof is 12 bytes
// (actual padding is implementation-defined — verify with sizeof below).
typedef struct {
char a; // 1 byte
int b; // 4 bytes (3 bytes padding before)
short c; // 2 bytes
// 1 byte padding at end
} unaligned_struct_t;
// Aligned structure
// alignas raises the whole struct's alignment; sizeof grows to a multiple
// of 16 so array elements stay aligned.
typedef struct {
alignas(16) int b; // 4 bytes, aligned to 16-byte boundary
char a; // 1 byte
short c; // 2 bytes
// 9 bytes padding at end for 16-byte alignment
} aligned_struct_t;
// Time writes to the int member of the padded vs the 16-byte-aligned
// struct across ARRAY_SIZE elements, then print both struct sizes.
void alignment_performance_test(void) {
    unaligned_struct_t *unaligned_array = malloc(ARRAY_SIZE * sizeof *unaligned_array);
    aligned_struct_t *aligned_array = malloc(ARRAY_SIZE * sizeof *aligned_array);
    if (!unaligned_array || !aligned_array) {   // BUG FIX: mallocs were unchecked
        printf("Failed to allocate arrays\n");
        free(unaligned_array);
        free(aligned_array);
        return;
    }
    clock_t start, end;
    // Test unaligned access
    start = clock();
    for (int i = 0; i < ARRAY_SIZE; i++) {
        unaligned_array[i].b = i * 2;
    }
    end = clock();
    double unaligned_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    // Test aligned access
    start = clock();
    for (int i = 0; i < ARRAY_SIZE; i++) {
        aligned_array[i].b = i * 2;
    }
    end = clock();
    double aligned_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    printf("Unaligned access time: %.4f seconds\n", unaligned_time);
    printf("Aligned access time: %.4f seconds\n", aligned_time);
    if (aligned_time > 0.0) {            // BUG FIX: guard division by zero
        printf("Aligned is %.2fx faster\n", unaligned_time / aligned_time);
    }
    // Print structure sizes
    printf("Unaligned struct size: %zu bytes\n", sizeof(unaligned_struct_t));
    printf("Aligned struct size: %zu bytes\n", sizeof(aligned_struct_t));
    free(unaligned_array);
    free(aligned_array);
}
Memory Layout Optimization
Understanding and optimizing memory layout can lead to significant performance improvements.
Data Locality
Keeping related data close together in memory improves cache performance:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define MATRIX_ROWS 1000
#define MATRIX_COLS 1000
// Poor data locality - accessing data with large strides
// Column-by-column traversal: successive reads are MATRIX_COLS ints apart,
// so almost every access misses the cache.
void poor_locality_example(int **matrix) {
    long total = 0;
    int col = 0;
    while (col < MATRIX_COLS) {
        int row = 0;
        while (row < MATRIX_ROWS) {
            total += matrix[row][col];
            row++;
        }
        col++;
    }
    // Volatile sink stops the compiler from deleting the loop
    volatile long dummy = total;
    (void)dummy;
}
// Good data locality - accessing data sequentially
// Row-by-row traversal: each row is read sequentially, so whole cache
// lines are consumed before being evicted.
void good_locality_example(int **matrix) {
    long total = 0;
    int row = 0;
    while (row < MATRIX_ROWS) {
        const int *line = matrix[row];
        int col = 0;
        while (col < MATRIX_COLS) {
            total += line[col];
            col++;
        }
        row++;
    }
    // Volatile sink stops the compiler from deleting the loop
    volatile long dummy = total;
    (void)dummy;
}
// Build a row-pointer matrix, then time the cache-hostile column sweep
// against the cache-friendly row sweep.
void data_locality_comparison(void) {
    // Allocate matrix
    int **matrix = malloc(MATRIX_ROWS * sizeof(int*));
    if (!matrix) {                       // BUG FIX: malloc was unchecked
        printf("Failed to allocate matrix\n");
        return;
    }
    for (int i = 0; i < MATRIX_ROWS; i++) {
        matrix[i] = malloc(MATRIX_COLS * sizeof(int));
        if (!matrix[i]) {                // BUG FIX: free rows already allocated
            printf("Failed to allocate matrix row\n");
            for (int k = 0; k < i; k++) {
                free(matrix[k]);
            }
            free(matrix);
            return;
        }
        for (int j = 0; j < MATRIX_COLS; j++) {
            matrix[i][j] = i * MATRIX_COLS + j;
        }
    }
    clock_t start, end;
    // Poor locality timing
    start = clock();
    poor_locality_example(matrix);
    end = clock();
    double poor_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    // Good locality timing
    start = clock();
    good_locality_example(matrix);
    end = clock();
    double good_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    printf("Poor locality time: %.4f seconds\n", poor_time);
    printf("Good locality time: %.4f seconds\n", good_time);
    if (good_time > 0.0) {               // BUG FIX: guard division by zero
        printf("Good locality is %.2fx faster\n", poor_time / good_time);
    }
    // Cleanup
    for (int i = 0; i < MATRIX_ROWS; i++) {
        free(matrix[i]);
    }
    free(matrix);
}
Memory Pooling
Memory pooling can reduce allocation overhead and improve data locality:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define POOL_SIZE 1024 * 1024 // 1MB pool
#define BLOCK_SIZE 256
// Header prepended to every block in the pool; blocks form a singly linked
// list in address order (splits in pool_malloc preserve that order, which
// the coalescing in pool_free relies on).
typedef struct memory_block {
struct memory_block *next; // next block header (physically adjacent in the slab)
size_t size; // payload capacity in bytes (header excluded)
int in_use; // nonzero while allocated
char data[1]; // Flexible array member
// NOTE(review): data[1] is the pre-C99 one-element-array trick, not a true
// C99 flexible array member (that would be `char data[];`).
} memory_block_t;
typedef struct {
char *pool_start; // start of the backing slab (also the first block header)
char *pool_end; // one past the end of the slab
memory_block_t *free_list; // head of the block list (holds free AND in-use blocks)
size_t total_size; // slab size in bytes
} memory_pool_t;
// Initialize memory pool
// Create a variable-size-block pool backed by one `size`-byte slab.
// The whole slab starts as a single free block. Returns NULL on failure.
memory_pool_t* pool_create(size_t size) {
    // BUG FIX: a slab smaller than one header made
    // `size - sizeof(memory_block_t)` underflow to a huge size_t.
    if (size <= sizeof(memory_block_t)) return NULL;
    memory_pool_t *pool = malloc(sizeof(memory_pool_t));
    if (!pool) return NULL;
    pool->pool_start = malloc(size);
    if (!pool->pool_start) {
        free(pool);
        return NULL;
    }
    pool->pool_end = pool->pool_start + size;
    pool->total_size = size;
    pool->free_list = (memory_block_t*)pool->pool_start;
    // Initialize first block to cover entire pool
    pool->free_list->next = NULL;
    pool->free_list->size = size - sizeof(memory_block_t);
    pool->free_list->in_use = 0;
    return pool;
}
// Allocate memory from pool
// First-fit allocation from the pool. Each block's `size` field is its
// payload capacity, so the scan compares against `size` directly.
// (BUG FIX: removed the dead `required_size` and `prev` locals — both were
// computed/tracked but never used.)
void* pool_malloc(memory_pool_t *pool, size_t size) {
    if (!pool || size == 0) return NULL;
    for (memory_block_t *current = pool->free_list; current; current = current->next) {
        if (current->in_use || current->size < size) {
            continue;
        }
        // Found suitable block
        current->in_use = 1;
        // Split off the tail if it can still hold a header plus a minimal
        // 32-byte payload; smaller remainders stay as internal slack.
        size_t remaining = current->size - size;
        if (remaining > sizeof(memory_block_t) + 32) {
            memory_block_t *new_block = (memory_block_t*)
                ((char*)current + sizeof(memory_block_t) + size);
            new_block->size = remaining - sizeof(memory_block_t);
            new_block->in_use = 0;
            new_block->next = current->next;
            current->size = size;
            current->next = new_block;
        }
        return current->data;
    }
    return NULL; // No suitable block found
}
// Free memory back to pool
// Return a pool_malloc'd pointer to the pool and coalesce adjacent free
// blocks. BUG FIX: the header must be recovered with
// offsetof(memory_block_t, data) — the payload begins at the data member's
// offset, NOT at sizeof(memory_block_t) (which includes data[1] plus tail
// padding), so the original subtraction landed inside the wrong bytes and
// corrupted the block header. Also rejects pointers outside the slab.
void pool_free(memory_pool_t *pool, void *ptr) {
    if (!pool || !ptr) return;
    char *p = ptr;
    // BUG FIX: validate that ptr actually lies inside this pool's slab
    if (p < pool->pool_start + offsetof(memory_block_t, data) || p >= pool->pool_end) {
        return;
    }
    // Get block header (payload starts at the data member's offset)
    memory_block_t *block = (memory_block_t*)(p - offsetof(memory_block_t, data));
    block->in_use = 0;
    // Coalesce adjacent free blocks (list order == address order)
    memory_block_t *current = pool->free_list;
    while (current) {
        if (!current->in_use) {
            memory_block_t *next = current->next;
            if (next && !next->in_use) {
                // Merge with next block
                current->size += sizeof(memory_block_t) + next->size;
                current->next = next->next;
                continue; // Re-check the merged block against its new neighbor
            }
        }
        current = current->next;
    }
}
// Destroy memory pool
// Release the slab and the pool descriptor. Safe to call with NULL;
// all pointers obtained from pool_malloc() become invalid.
void pool_destroy(memory_pool_t *pool) {
    if (!pool) {
        return;
    }
    free(pool->pool_start);
    free(pool);
}
// Example usage
void memory_pool_example(void) {
memory_pool_t *pool = pool_create(POOL_SIZE);
if (!pool) {
printf("Failed to create memory pool\n");
return;
}
printf("Created memory pool of %zu bytes\n", POOL_SIZE);
// Allocate some blocks
void *ptr1 = pool_malloc(pool, 128);
void *ptr2 = pool_malloc(pool, 256);
void *ptr3 = pool_malloc(pool, 64);
printf("Allocated 3 blocks\n");
// Free some blocks
pool_free(pool, ptr1);
pool_free(pool, ptr3);
printf("Freed 2 blocks\n");
// Allocate more blocks (should reuse freed space)
void *ptr4 = pool_malloc(pool, 96);
void *ptr5 = pool_malloc(pool, 160);
printf("Allocated 2 more blocks\n");
// Cleanup
pool_destroy(pool);
}Memory Access Patterns
Understanding and optimizing memory access patterns can significantly improve performance.
Sequential vs Random Access
Sequential memory access is much faster than random access due to cache behavior:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define ARRAY_SIZE 1000000
#define ACCESS_COUNT 1000000
// Sum the array front to back; the hardware prefetcher can stream the
// data, so nearly every access is a cache hit.
void sequential_access_test(int *array) {
    long total = 0;
    const int *end = array + ARRAY_SIZE;
    for (const int *p = array; p < end; p++) {
        total += *p;
    }
    // Volatile sink stops the compiler from deleting the loop
    volatile long dummy = total;
    (void)dummy;
}
// Sum ARRAY_SIZE randomly chosen elements; the unpredictable indices
// defeat both the cache and the prefetcher. Uses rand(), so results
// depend on the caller's srand() seed.
void random_access_test(int *array) {
    long total = 0;
    int n = 0;
    while (n < ARRAY_SIZE) {
        total += array[rand() % ARRAY_SIZE];
        n++;
    }
    // Volatile sink stops the compiler from deleting the loop
    volatile long dummy = total;
    (void)dummy;
}
// Time one sequential sweep against ARRAY_SIZE random probes over the
// same array and report the ratio.
void access_pattern_comparison(void) {
    int *array = malloc(ARRAY_SIZE * sizeof(int));
    if (!array) {
        printf("Failed to allocate array\n");
        return;
    }
    // Initialize array
    for (int i = 0; i < ARRAY_SIZE; i++) {
        array[i] = i;
    }
    clock_t start, end;
    // Sequential access timing
    start = clock();
    sequential_access_test(array);
    end = clock();
    double sequential_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    // Random access timing
    srand(42); // Fixed seed for reproducible results
    start = clock();
    random_access_test(array);
    end = clock();
    double random_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    printf("Sequential access time: %.4f seconds\n", sequential_time);
    printf("Random access time: %.4f seconds\n", random_time);
    if (sequential_time > 0.0) {         // BUG FIX: guard division by zero
        printf("Sequential is %.2fx faster\n", random_time / sequential_time);
    }
    free(array);
}
Practical Memory Optimization Techniques
Reducing Memory Fragmentation
Memory fragmentation can lead to inefficient memory usage and allocation failures:
#include <stdio.h>
#include <stdlib.h>
#define POOL_SIZE 10000
// Simple example showing fragmentation
void fragmentation_example(void) {
void *ptrs[1000];
printf("Demonstrating memory fragmentation:\n");
// Allocate many small blocks
for (int i = 0; i < 1000; i++) {
ptrs[i] = malloc(10);
if (!ptrs[i]) {
printf("Allocation failed at %d\n", i);
return;
}
}
// Free every other block
for (int i = 0; i < 1000; i += 2) {
free(ptrs[i]);
ptrs[i] = NULL;
}
// Try to allocate a larger block
void *large_block = malloc(500);
if (large_block) {
printf("Large block allocation succeeded\n");
free(large_block);
} else {
printf("Large block allocation failed due to fragmentation\n");
}
// Free remaining blocks
for (int i = 1; i < 1000; i += 2) {
if (ptrs[i]) {
free(ptrs[i]);
}
}
}Memory Prefetching
Modern processors support prefetching instructions to improve memory access performance:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define ARRAY_SIZE 1000000
// Manual prefetching example
// Sum the array while asking the CPU (via GCC/Clang __builtin_prefetch)
// to pull data 16 elements ahead into cache before it is needed.
void prefetch_example(int *array) {
    long total = 0;
    for (int idx = 0; idx < ARRAY_SIZE; idx++) {
        int ahead = idx + 16;
        if (ahead < ARRAY_SIZE) {
            // read-intent (0), maximum temporal locality (3)
            __builtin_prefetch(&array[ahead], 0, 3);
        }
        total += array[idx];
    }
    // Volatile sink stops the compiler from deleting the loop
    volatile long dummy = total;
    (void)dummy;
}
// Without prefetching
// Baseline: the same summation loop with no explicit prefetching.
void no_prefetch_example(int *array) {
    long total = 0;
    const int *end = array + ARRAY_SIZE;
    for (const int *p = array; p < end; p++) {
        total += *p;
    }
    // Volatile sink stops the compiler from deleting the loop
    volatile long dummy = total;
    (void)dummy;
}
// Time the plain summation loop against the explicitly-prefetched one
// and report the improvement factor.
void prefetch_comparison(void) {
    int *array = malloc(ARRAY_SIZE * sizeof(int));
    if (!array) {
        printf("Failed to allocate array\n");
        return;
    }
    // Initialize array
    for (int i = 0; i < ARRAY_SIZE; i++) {
        array[i] = i;
    }
    clock_t start, end;
    // Without prefetching
    start = clock();
    no_prefetch_example(array);
    end = clock();
    double no_prefetch_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    // With prefetching
    start = clock();
    prefetch_example(array);
    end = clock();
    double prefetch_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    printf("Without prefetching: %.4f seconds\n", no_prefetch_time);
    printf("With prefetching: %.4f seconds\n", prefetch_time);
    if (prefetch_time > 0.0) {           // BUG FIX: guard division by zero
        printf("Prefetching improvement: %.2fx\n", no_prefetch_time / prefetch_time);
    }
    free(array);
}
Memory Debugging and Analysis
Tools and techniques for analyzing memory usage and identifying optimization opportunities.
Simple Memory Tracker
A basic memory tracking system to monitor allocations:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Simple memory tracker
// One tracking node per live allocation, kept in a singly linked list.
typedef struct mem_block {
struct mem_block *next; // next node in the live-allocation list
void *ptr; // pointer returned to the caller by tracked_malloc
size_t size; // requested size in bytes
const char *file; // __FILE__ of the allocation site
int line; // __LINE__ of the allocation site
} mem_block_t;
// Global tracker state (not thread-safe).
static mem_block_t *mem_list = NULL; // live (not yet freed) allocations
static size_t total_allocated = 0; // bytes currently outstanding
static size_t peak_allocated = 0; // high-water mark of total_allocated
static size_t allocation_count = 0; // cumulative number of tracked allocations
// Track memory allocation
// malloc wrapper that records the pointer, size, and call site in the
// global tracking list and maintains the usage statistics. Returns NULL
// (allocating nothing) if either the user block or the tracking node
// cannot be allocated.
void* tracked_malloc(size_t size, const char *file, int line) {
    void *user_ptr = malloc(size);
    if (user_ptr == NULL) {
        return NULL;
    }
    mem_block_t *node = malloc(sizeof *node);
    if (node == NULL) {
        free(user_ptr);
        return NULL;
    }
    node->ptr = user_ptr;
    node->size = size;
    node->file = file;
    node->line = line;
    node->next = mem_list;
    mem_list = node;
    allocation_count++;
    total_allocated += size;
    if (peak_allocated < total_allocated) {
        peak_allocated = total_allocated;
    }
    return user_ptr;
}
// Track memory deallocation
// Untrack and release a pointer previously returned by tracked_malloc().
// Passing NULL is a no-op. If ptr is not in the tracking list it is either
// foreign or already freed; freeing it anyway would risk double-free
// undefined behavior, so we only warn.
// (BUG FIX: the original printed the double-free warning and then called
// free(ptr) regardless.)
void tracked_free(void *ptr, const char *file, int line) {
    if (!ptr) return;
    // Find and remove from tracking list
    mem_block_t *prev = NULL;
    mem_block_t *current = mem_list;
    while (current) {
        if (current->ptr == ptr) {
            if (prev) {
                prev->next = current->next;
            } else {
                mem_list = current->next;
            }
            total_allocated -= current->size;
            free(current);
            free(ptr);
            return;
        }
        prev = current;
        current = current->next;
    }
    // Not found in tracking list - possible double free; do NOT free it.
    printf("Warning: Attempt to free untracked pointer at %s:%d\n", file, line);
}
// Print memory statistics
// Dump the tracker's counters and, if any allocations are still live,
// list each one with its size and allocation site.
void print_memory_stats(void) {
    printf("\n=== Memory Statistics ===\n");
    printf("Total allocations: %zu\n", allocation_count);
    printf("Currently allocated: %zu bytes\n", total_allocated);
    printf("Peak allocation: %zu bytes\n", peak_allocated);
    if (mem_list == NULL) {
        printf("No memory leaks detected\n");
        return;
    }
    printf("Leaked blocks:\n");
    for (mem_block_t *node = mem_list; node != NULL; node = node->next) {
        printf(" %p: %zu bytes at %s:%d\n",
               node->ptr, node->size, node->file, node->line);
    }
}
// Cleanup tracking (call at program exit)
void cleanup_memory_tracking(void) {
mem_block_t *current = mem_list;
while (current) {
mem_block_t *next = current->next;
free(current->ptr);
free(current);
current = next;
}
mem_list = NULL;
}
// Macros for easier usage
#define MALLOC(size) tracked_malloc(size, __FILE__, __LINE__)
#define FREE(ptr) tracked_free(ptr, __FILE__, __LINE__)
// Example usage
// Example usage: allocate through the tracker, free one block, leak the
// other on purpose, and show the statistics/leak report at each step.
void memory_tracking_example(void) {
    printf("Memory tracking example:\n");
    // Allocate some memory (no cast needed on malloc's void* in C)
    int *array1 = MALLOC(100 * sizeof(int));
    char *string = MALLOC(50);
    if (!array1 || !string) {            // BUG FIX: results were used unchecked
        printf("Allocation failed\n");
        FREE(array1);                    // tracked_free ignores NULL
        FREE(string);
        return;
    }
    // Use the memory
    for (int i = 0; i < 100; i++) {
        array1[i] = i;
    }
    strcpy(string, "Hello, World!");
    printf("Allocated array and string\n");
    print_memory_stats();
    // Free one block
    FREE(array1);
    printf("Freed array\n");
    print_memory_stats();
    // Intentionally leak string to demonstrate leak detection
    printf("Intentionally leaking string...\n");
    print_memory_stats();
    // Cleanup
    cleanup_memory_tracking();
}
Summary
Memory optimization is a multifaceted discipline that requires understanding of:
- Memory Hierarchy - How different levels of memory affect performance
- Allocation Strategies - Custom allocators, stack vs heap allocation
- Data Structure Design - Structure of Arrays vs Array of Structures, alignment
- Memory Layout - Data locality, memory pooling
- Access Patterns - Sequential vs random access, prefetching
- Fragmentation Management - Reducing memory fragmentation
- Analysis Tools - Memory tracking and debugging
Key principles for effective memory optimization:
- Profile First - Measure before and after optimization to verify improvements
- Understand Your Data - Know how your data is accessed and sized
- Consider the Cache - Design for cache-friendly access patterns
- Minimize Allocations - Reduce allocation frequency and overhead
- Track Memory Usage - Monitor for leaks and fragmentation
- Use Appropriate Tools - Leverage profilers and analysis tools
These techniques, when applied appropriately, can lead to significant performance improvements in C programs, especially in resource-constrained or performance-critical applications.