Entropy and Information Content

Entropy is the fundamental measure of information content and uncertainty in information theory. It quantifies how much information is contained in a message or how uncertain we are about the outcome of a random variable.

Information Content

Self-Information

The information content (self-information) of an event is determined by its probability: the less likely the event, the more information its occurrence conveys. In bits (base-2 logarithm), it is

I(x) = -log₂(P(x))

import numpy as np
import matplotlib.pyplot as plt

def information_content(probability):
    """Calculate information content in bits"""
    return -np.log2(probability)

# Example: Information content of different events
events = [
    ("Coin flip (heads)", 0.5),
    ("Rolling a 6", 1/6),
    ("Drawing ace of spades", 1/52),
    ("Winning lottery", 1e-8)
]

print("Information Content Examples:")
for event, prob in events:
    info = information_content(prob)
    print(f"{event}: P = {prob:.2e}, I = {info:.2f} bits")

# Visualize information content vs probability
probs = np.logspace(-6, 0, 1000)  # From 10^-6 to 1
info_content = information_content(probs)

plt.figure(figsize=(10, 6))
plt.semilogx(probs, info_content, 'b-', linewidth=2)
plt.xlabel('Probability')
plt.ylabel('Information Content (bits)')
plt.title('Information Content vs Probability')
plt.grid(True)
plt.show()

Shannon Entropy

Definition

For a discrete random variable X with possible values {x₁, x₂, …, xₙ}:

H(X) = -Σ P(xᵢ) log₂ P(xᵢ)

def shannon_entropy(probabilities):
    """Calculate Shannon entropy in bits"""
    # Remove zero probabilities to avoid log(0)
    probs = np.array(probabilities)
    probs = probs[probs > 0]
    return -np.sum(probs * np.log2(probs))

# Example 1: Fair coin
fair_coin = [0.5, 0.5]
print(f"Fair coin entropy: {shannon_entropy(fair_coin):.3f} bits")

# Example 2: Biased coin
biased_coin = [0.9, 0.1]
print(f"Biased coin entropy: {shannon_entropy(biased_coin):.3f} bits")

# Example 3: Fair die
fair_die = [1/6] * 6
print(f"Fair die entropy: {shannon_entropy(fair_die):.3f} bits")

# Example 4: Loaded die
loaded_die = [0.5, 0.1, 0.1, 0.1, 0.1, 0.1]
print(f"Loaded die entropy: {shannon_entropy(loaded_die):.3f} bits")

Properties of Entropy

def plot_binary_entropy():
    """Plot entropy of binary random variable"""
    p = np.linspace(0.001, 0.999, 1000)
    entropy = -p * np.log2(p) - (1-p) * np.log2(1-p)

    plt.figure(figsize=(10, 6))
    plt.plot(p, entropy, 'b-', linewidth=2)
    plt.axhline(y=1, color='r', linestyle='--', alpha=0.7, label='Maximum entropy')
    plt.axvline(x=0.5, color='r', linestyle='--', alpha=0.7)
    plt.xlabel('Probability p')
    plt.ylabel('Entropy H(X) (bits)')
    plt.title('Binary Entropy Function')
    plt.grid(True)
    plt.legend()
    plt.show()

    print(f"Maximum entropy occurs at p = 0.5: {shannon_entropy([0.5, 0.5]):.3f} bits")

plot_binary_entropy()
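
Beyond the binary case, two properties hold for any discrete distribution over n outcomes: H(X) ≥ 0, with equality when one outcome is certain, and H(X) ≤ log₂(n), with equality only for the uniform distribution. A short sketch illustrating the bounds with randomly generated distributions:

# Entropy of any n-outcome distribution lies in [0, log2(n)];
# the maximum is attained only by the uniform distribution
n = 6
print(f"Upper bound log2({n}) = {np.log2(n):.3f} bits")
for _ in range(3):
    p = np.random.dirichlet(np.ones(n))  # random probability vector summing to 1
    print(f"Random distribution: H = {shannon_entropy(p):.3f} bits")
print(f"Uniform distribution: H = {shannon_entropy([1/n] * n):.3f} bits")
print(f"Certain outcome:      H = {shannon_entropy([1.0] + [0.0] * (n - 1)):.3f} bits")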

Entropy in Data Analysis

Text Analysis

from collections import Counter
import string

def text_entropy(text):
    """Calculate entropy of text based on character frequencies"""
    # Convert to lowercase and remove punctuation
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    text = text.replace(' ', '')  # Remove spaces for character-level analysis

    # Count character frequencies
    char_counts = Counter(text)
    total_chars = len(text)

    # Calculate probabilities
    probabilities = [count / total_chars for count in char_counts.values()]

    return shannon_entropy(probabilities), char_counts

# Example texts
texts = [
    ("English text", "The quick brown fox jumps over the lazy dog"),
    ("Repetitive text", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
    ("Random text", "xqzjkwpvmcnbghytrueioadslf"),
    ("Structured text", "abcabcabcabcabcabcabcabcabcabc")
]

print("Text Entropy Analysis:")
for name, text in texts:
    entropy, char_counts = text_entropy(text)
    print(f"{name}: {entropy:.3f} bits per character")
    print(f"  Unique characters: {len(char_counts)}")
    print(f"  Most common: {char_counts.most_common(3)}")
    print()

Image Entropy

def image_entropy(image):
    """Calculate entropy of image pixel values"""
    # Flatten image and get pixel value counts
    pixels = image.flatten()
    pixel_counts = np.bincount(pixels)

    # Calculate probabilities (excluding zero counts)
    probabilities = pixel_counts[pixel_counts > 0] / len(pixels)

    return shannon_entropy(probabilities)

# Create sample images with different entropy levels
def create_sample_images():
    # Low entropy: mostly uniform with slight noise
    # (build in a signed dtype, then clip and cast, to avoid uint8 casting errors)
    noise = np.random.randint(-10, 10, (100, 100))
    low_entropy = np.clip(128 + noise, 0, 255).astype(np.uint8)

    # Medium entropy: gradient
    x, y = np.meshgrid(np.linspace(0, 255, 100), np.linspace(0, 255, 100))
    medium_entropy = ((x + y) / 2).astype(np.uint8)

    # High entropy: random noise
    high_entropy = np.random.randint(0, 256, (100, 100), dtype=np.uint8)

    return low_entropy, medium_entropy, high_entropy

low_ent_img, med_ent_img, high_ent_img = create_sample_images()

# Calculate entropies
entropies = [
    ("Low entropy", low_ent_img),
    ("Medium entropy", med_ent_img),
    ("High entropy", high_ent_img)
]

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for i, (name, img) in enumerate(entropies):
    entropy = image_entropy(img)
    axes[i].imshow(img, cmap='gray')
    axes[i].set_title(f'{name}\nH = {entropy:.2f} bits')
    axes[i].axis('off')

plt.tight_layout()
plt.show()
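
If scikit-image is installed, skimage.measure.shannon_entropy performs an equivalent pixel-level calculation and makes a handy cross-check (an optional sketch, not required for the analysis above):

from skimage.measure import shannon_entropy as skimage_entropy

# Cross-check our image_entropy against scikit-image's implementation
for name, img in entropies:
    print(f"{name}: ours = {image_entropy(img):.2f}, skimage = {skimage_entropy(img):.2f}")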

Cross-Entropy and KL Divergence

Cross-Entropy

Cross-entropy measures the average number of bits needed to encode events from distribution P using a code optimized for distribution Q:

H(P,Q) = -Σ P(x) log₂ Q(x)

def cross_entropy(p, q):
    """Calculate cross-entropy between distributions p and q"""
    p, q = np.array(p), np.array(q)
    # Avoid log(0) by adding small epsilon
    epsilon = 1e-15
    q = np.clip(q, epsilon, 1)
    return -np.sum(p * np.log2(q))

# Example: True vs predicted distributions
true_dist = [0.7, 0.2, 0.1]
pred_dist1 = [0.6, 0.3, 0.1]  # Close to true
pred_dist2 = [0.1, 0.1, 0.8]  # Far from true

print(f"True distribution entropy: {shannon_entropy(true_dist):.3f} bits")
print(f"Cross-entropy (close prediction): {cross_entropy(true_dist, pred_dist1):.3f} bits")
print(f"Cross-entropy (poor prediction): {cross_entropy(true_dist, pred_dist2):.3f} bits")

Kullback-Leibler (KL) Divergence

KL divergence (relative entropy) measures how much a distribution Q diverges from a reference distribution P; it is non-negative and asymmetric:

D_KL(P||Q) = Σ P(x) log₂(P(x)/Q(x))

def kl_divergence(p, q):
    """Calculate KL divergence from q to p"""
    p, q = np.array(p), np.array(q)
    epsilon = 1e-15
    p = np.clip(p, epsilon, 1)
    q = np.clip(q, epsilon, 1)
    return np.sum(p * np.log2(p / q))

# Properties of KL divergence
print("KL Divergence Properties:")
print(f"D_KL(P||P) = {kl_divergence(true_dist, true_dist):.6f} (always 0)")
print(f"D_KL(P||Q1) = {kl_divergence(true_dist, pred_dist1):.3f}")
print(f"D_KL(P||Q2) = {kl_divergence(true_dist, pred_dist2):.3f}")
print(f"D_KL(Q1||P) = {kl_divergence(pred_dist1, true_dist):.3f} (asymmetric)")

# Relationship: H(P,Q) = H(P) + D_KL(P||Q)
h_p = shannon_entropy(true_dist)
h_pq1 = cross_entropy(true_dist, pred_dist1)
kl_pq1 = kl_divergence(true_dist, pred_dist1)

print(f"\nRelationship verification:")
print(f"H(P) = {h_p:.3f}")
print(f"H(P,Q) = {h_pq1:.3f}")
print(f"D_KL(P||Q) = {kl_pq1:.3f}")
print(f"H(P) + D_KL(P||Q) = {h_p + kl_pq1:.3f}")

Applications in Machine Learning

Decision Trees and Information Gain

def information_gain(parent, children, weights=None):
    """Calculate information gain for a split.

    parent and children are class-probability distributions; weights gives
    the fraction of samples routed to each child (equal-sized children are
    assumed if weights is omitted).
    """
    parent_entropy = shannon_entropy(parent)

    if weights is None:
        weights = [1 / len(children)] * len(children)

    # Weighted average of the children's entropies
    weighted_entropy = sum(
        w * shannon_entropy(child) for w, child in zip(weights, children)
    )

    return parent_entropy - weighted_entropy

# Example: Binary classification dataset
# Class distribution before split
parent_classes = [0.6, 0.4]  # 60% class 0, 40% class 1

# After split on feature
left_child = [0.9, 0.1]   # 90% class 0, 10% class 1
right_child = [0.2, 0.8]  # 20% class 0, 80% class 1

# Assume equal-sized children for simplicity
children = [left_child, right_child]

gain = information_gain(parent_classes, children)
print(f"Information gain from split: {gain:.3f} bits")

# Compare with different splits
print("\nComparing different splits:")
splits = [
    ("Good split", [[0.9, 0.1], [0.2, 0.8]]),
    ("Poor split", [[0.55, 0.45], [0.65, 0.35]]),
    ("Perfect split", [[1.0, 0.0], [0.0, 1.0]])
]

for name, children in splits:
    gain = information_gain(parent_classes, children)
    print(f"{name}: {gain:.3f} bits")

Feature Selection using Mutual Information

from sklearn.feature_selection import mutual_info_classif
from sklearn.datasets import make_classification

# Generate sample dataset
X, y = make_classification(n_samples=1000, n_features=10, n_informative=5,
                          n_redundant=2, n_clusters_per_class=1, random_state=42)

# Calculate mutual information between features and target
mi_scores = mutual_info_classif(X, y, random_state=42)

# Visualize feature importance
plt.figure(figsize=(10, 6))
feature_names = [f'Feature {i}' for i in range(X.shape[1])]
plt.bar(feature_names, mi_scores)
plt.xlabel('Features')
plt.ylabel('Mutual Information')
plt.title('Feature Importance using Mutual Information')
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("Feature ranking by mutual information:")
for i, score in sorted(enumerate(mi_scores), key=lambda x: x[1], reverse=True):
    print(f"Feature {i}: {score:.3f}")

Entropy in Compression

Huffman Coding Efficiency

import heapq
from collections import defaultdict, Counter

class HuffmanNode:
    def __init__(self, char=None, freq=0, left=None, right=None):
        self.char = char
        self.freq = freq
        self.left = left
        self.right = right

    def __lt__(self, other):
        return self.freq < other.freq

def build_huffman_tree(text):
    """Build Huffman tree from text"""
    # Count character frequencies
    freq = Counter(text)

    # Create priority queue
    heap = [HuffmanNode(char, freq) for char, freq in freq.items()]
    heapq.heapify(heap)

    # Build tree
    while len(heap) > 1:
        left = heapq.heappop(heap)
        right = heapq.heappop(heap)
        merged = HuffmanNode(freq=left.freq + right.freq, left=left, right=right)
        heapq.heappush(heap, merged)

    return heap[0] if heap else None

def get_huffman_codes(root):
    """Get Huffman codes from tree"""
    if not root:
        return {}

    codes = {}

    def traverse(node, code=""):
        if node.char is not None:  # Leaf node
            codes[node.char] = code or "0"  # Handle single character case
        else:
            traverse(node.left, code + "0")
            traverse(node.right, code + "1")

    traverse(root)
    return codes

def analyze_compression_efficiency(text):
    """Analyze Huffman coding efficiency"""
    # Calculate entropy
    char_counts = Counter(text)
    total_chars = len(text)
    probabilities = [count / total_chars for count in char_counts.values()]
    entropy = shannon_entropy(probabilities)

    # Build Huffman tree and get codes
    root = build_huffman_tree(text)
    codes = get_huffman_codes(root)

    # Calculate average code length
    avg_length = sum(char_counts[char] * len(code) for char, code in codes.items()) / total_chars

    # Calculate compression ratio
    original_bits = len(text) * 8  # ASCII encoding
    compressed_bits = sum(char_counts[char] * len(codes[char]) for char in char_counts)
    compression_ratio = compressed_bits / original_bits

    print(f"Text: '{text[:50]}{'...' if len(text) > 50 else ''}'")
    print(f"Entropy: {entropy:.3f} bits per symbol")
    print(f"Average Huffman code length: {avg_length:.3f} bits per symbol")
    print(f"Efficiency: {entropy / avg_length:.3f} ({entropy / avg_length * 100:.1f}%)")
    print(f"Compression ratio: {compression_ratio:.3f} ({compression_ratio * 100:.1f}%)")
    print(f"Space saved: {(1 - compression_ratio) * 100:.1f}%")
    print()

    return entropy, avg_length, codes

# Test with different types of text
test_texts = [
    "AAAAAAAAAA",  # Low entropy
    "ABABABAB",    # Medium entropy
    "ABCDEFGHIJ",  # High entropy
    "The quick brown fox jumps over the lazy dog"  # Natural text
]

for text in test_texts:
    analyze_compression_efficiency(text)
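
For a practical point of comparison (a rough sketch using Python's built-in zlib; note that general-purpose compressors also exploit repeated substrings, and their headers can make very short inputs grow rather than shrink):

import zlib

for text in test_texts:
    raw = text.encode('ascii')
    compressed = zlib.compress(raw, 9)  # maximum compression level
    print(f"{text[:30]!r}: {len(raw)} bytes -> {len(compressed)} bytes")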

Practical Applications

Password Strength Analysis

import string
import math

def password_entropy(password):
    """Estimate password entropy"""
    # Determine character set size
    charset_size = 0
    if any(c.islower() for c in password):
        charset_size += 26  # lowercase
    if any(c.isupper() for c in password):
        charset_size += 26  # uppercase
    if any(c.isdigit() for c in password):
        charset_size += 10  # digits
    if any(c in string.punctuation for c in password):
        charset_size += len(string.punctuation)  # special characters

    # Calculate entropy assuming uniform distribution
    entropy = len(password) * math.log2(charset_size)

    return entropy, charset_size

def analyze_password_strength(passwords):
    """Analyze strength of different passwords"""
    print("Password Strength Analysis:")
    print("-" * 60)

    for pwd in passwords:
        entropy, charset = password_entropy(pwd)

        # Strength categories
        if entropy < 30:
            strength = "Very Weak"
        elif entropy < 50:
            strength = "Weak"
        elif entropy < 70:
            strength = "Moderate"
        elif entropy < 90:
            strength = "Strong"
        else:
            strength = "Very Strong"

        print(f"Password: {'*' * len(pwd)} (length: {len(pwd)})")
        print(f"Charset size: {charset}")
        print(f"Entropy: {entropy:.1f} bits")
        print(f"Strength: {strength}")
        print(f"Time to crack (1B guesses/sec): {2**(entropy-1) / 1e9:.2e} seconds")
        print()

# Test passwords
test_passwords = [
    "123456",
    "password",
    "Password1",
    "P@ssw0rd!",
    "MyVeryLongAndComplexPassword123!@#"
]

analyze_password_strength(test_passwords)

Network Protocol Efficiency

def protocol_efficiency_analysis():
    """Analyze efficiency of different encoding schemes"""

    # Simulate message types and their frequencies
    message_types = {
        'ACK': 0.4,      # Acknowledgment
        'DATA': 0.3,     # Data packet
        'NACK': 0.1,     # Negative acknowledgment
        'PING': 0.1,     # Ping
        'CLOSE': 0.05,   # Close connection
        'ERROR': 0.05    # Error message
    }

    # Fixed-length encoding (3 bits per message type)
    fixed_length = 3

    # Calculate optimal variable-length encoding
    probabilities = list(message_types.values())
    entropy = shannon_entropy(probabilities)

    # Huffman-like optimal encoding lengths
    # Sort by probability (descending)
    sorted_types = sorted(message_types.items(), key=lambda x: x[1], reverse=True)

    # Assign code lengths from an actual Huffman construction for these
    # frequencies; they satisfy the Kraft inequality (1/2 + 1/4 + 4/16 = 1),
    # so a valid prefix code with these lengths exists
    optimal_codes = {
        sorted_types[0][0]: 1,  # Most frequent: 1 bit
        sorted_types[1][0]: 2,  # Second: 2 bits
        sorted_types[2][0]: 4,  # Third: 4 bits
        sorted_types[3][0]: 4,  # Fourth: 4 bits
        sorted_types[4][0]: 4,  # Fifth: 4 bits
        sorted_types[5][0]: 4   # Sixth: 4 bits
    }

    # Calculate average lengths
    avg_fixed = fixed_length
    avg_optimal = sum(message_types[msg] * length for msg, length in optimal_codes.items())

    print("Network Protocol Encoding Analysis:")
    print("-" * 50)
    print(f"Message entropy: {entropy:.3f} bits")
    print(f"Fixed-length encoding: {avg_fixed} bits per message")
    print(f"Optimal variable-length: {avg_optimal:.3f} bits per message")
    print(f"Efficiency gain: {(avg_fixed - avg_optimal) / avg_fixed * 100:.1f}%")
    print()

    print("Message type frequencies and optimal codes:")
    for msg, prob in sorted_types:
        print(f"{msg}: {prob:.2%} -> {optimal_codes[msg]} bits")

protocol_efficiency_analysis()

Summary

Entropy and information content are fundamental concepts that:

  1. Quantify Information: Measure how much information is contained in data
  2. Guide Compression: Determine theoretical limits of data compression
  3. Optimize Communication: Design efficient encoding schemes
  4. Analyze Uncertainty: Measure randomness and predictability
  5. Select Features: Identify informative features in machine learning
  6. Assess Security: Evaluate password strength and cryptographic systems

Understanding these concepts enables you to:

  - Design efficient data compression algorithms
  - Analyze the information content of datasets
  - Optimize communication protocols
  - Build better machine learning models
  - Evaluate security systems