Entropy and Information Content
Entropy is the fundamental measure of information content and uncertainty in information theory. It quantifies how much information is contained in a message or how uncertain we are about the outcome of a random variable.
Information Content
Self-Information
The information content of an event is inversely related to its probability:
I(x) = -log₂(P(x))
import numpy as np
import matplotlib.pyplot as plt
def information_content(probability):
"""Calculate information content in bits"""
return -np.log2(probability)
# Example: Information content of different events
= [
events "Coin flip (heads)", 0.5),
("Rolling a 6", 1/6),
("Drawing ace of spades", 1/52),
("Winning lottery", 1e-8)
(
]
print("Information Content Examples:")
for event, prob in events:
= information_content(prob)
info print(f"{event}: P = {prob:.2e}, I = {info:.2f} bits")
# Visualize information content vs probability
= np.logspace(-6, 0, 1000) # From 10^-6 to 1
probs = information_content(probs)
info_content
=(10, 6))
plt.figure(figsize'b-', linewidth=2)
plt.semilogx(probs, info_content, 'Probability')
plt.xlabel('Information Content (bits)')
plt.ylabel('Information Content vs Probability')
plt.title(True)
plt.grid( plt.show()
Shannon Entropy
Definition
For a discrete random variable X with possible values {x₁, x₂, …, xₙ}:
H(X) = -Σ P(xᵢ) log₂ P(xᵢ)
def shannon_entropy(probabilities):
"""Calculate Shannon entropy in bits"""
# Remove zero probabilities to avoid log(0)
= np.array(probabilities)
probs = probs[probs > 0]
probs return -np.sum(probs * np.log2(probs))
# Example 1: Fair coin
= [0.5, 0.5]
fair_coin print(f"Fair coin entropy: {shannon_entropy(fair_coin):.3f} bits")
# Example 2: Biased coin
= [0.9, 0.1]
biased_coin print(f"Biased coin entropy: {shannon_entropy(biased_coin):.3f} bits")
# Example 3: Fair die
= [1/6] * 6
fair_die print(f"Fair die entropy: {shannon_entropy(fair_die):.3f} bits")
# Example 4: Loaded die
= [0.5, 0.1, 0.1, 0.1, 0.1, 0.1]
loaded_die print(f"Loaded die entropy: {shannon_entropy(loaded_die):.3f} bits")
Properties of Entropy
def plot_binary_entropy():
"""Plot entropy of binary random variable"""
= np.linspace(0.001, 0.999, 1000)
p = -p * np.log2(p) - (1-p) * np.log2(1-p)
entropy
=(10, 6))
plt.figure(figsize'b-', linewidth=2)
plt.plot(p, entropy, =1, color='r', linestyle='--', alpha=0.7, label='Maximum entropy')
plt.axhline(y=0.5, color='r', linestyle='--', alpha=0.7)
plt.axvline(x'Probability p')
plt.xlabel('Entropy H(X) (bits)')
plt.ylabel('Binary Entropy Function')
plt.title(True)
plt.grid(
plt.legend()
plt.show()
print(f"Maximum entropy occurs at p = 0.5: {shannon_entropy([0.5, 0.5]):.3f} bits")
plot_binary_entropy()
Entropy in Data Analysis
Text Analysis
from collections import Counter
import string
def text_entropy(text):
"""Calculate entropy of text based on character frequencies"""
# Convert to lowercase and remove punctuation
= text.lower().translate(str.maketrans('', '', string.punctuation))
text = text.replace(' ', '') # Remove spaces for character-level analysis
text
# Count character frequencies
= Counter(text)
char_counts = len(text)
total_chars
# Calculate probabilities
= [count / total_chars for count in char_counts.values()]
probabilities
return shannon_entropy(probabilities), char_counts
# Example texts
= [
texts "English text", "The quick brown fox jumps over the lazy dog"),
("Repetitive text", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"),
("Random text", "xqzjkwpvmcnbghytrueioadslf"),
("Structured text", "abcabcabcabcabcabcabcabcabcabc")
(
]
print("Text Entropy Analysis:")
for name, text in texts:
= text_entropy(text)
entropy, char_counts print(f"{name}: {entropy:.3f} bits per character")
print(f" Unique characters: {len(char_counts)}")
print(f" Most common: {char_counts.most_common(3)}")
print()
Image Entropy
def image_entropy(image):
"""Calculate entropy of image pixel values"""
# Flatten image and get pixel value counts
= image.flatten()
pixels = np.bincount(pixels)
pixel_counts
# Calculate probabilities (excluding zero counts)
= pixel_counts[pixel_counts > 0] / len(pixels)
probabilities
return shannon_entropy(probabilities)
# Create sample images with different entropy levels
def create_sample_images():
# Low entropy: mostly uniform
= np.full((100, 100), 128, dtype=np.uint8)
low_entropy += np.random.randint(-10, 10, (100, 100))
low_entropy
# Medium entropy: gradient
= np.meshgrid(np.linspace(0, 255, 100), np.linspace(0, 255, 100))
x, y = ((x + y) / 2).astype(np.uint8)
medium_entropy
# High entropy: random noise
= np.random.randint(0, 256, (100, 100), dtype=np.uint8)
high_entropy
return low_entropy, medium_entropy, high_entropy
= create_sample_images()
low_ent_img, med_ent_img, high_ent_img
# Calculate entropies
= [
entropies "Low entropy", low_ent_img),
("Medium entropy", med_ent_img),
("High entropy", high_ent_img)
(
]
= plt.subplots(1, 3, figsize=(15, 5))
fig, axes
for i, (name, img) in enumerate(entropies):
= image_entropy(img)
entropy ='gray')
axes[i].imshow(img, cmapf'{name}\nH = {entropy:.2f} bits')
axes[i].set_title('off')
axes[i].axis(
plt.tight_layout() plt.show()
Cross-Entropy and KL Divergence
Cross-Entropy
Cross-entropy measures the average number of bits needed to encode events from distribution P using a code optimized for distribution Q:
H(P,Q) = -Σ P(x) log₂ Q(x)
def cross_entropy(p, q):
"""Calculate cross-entropy between distributions p and q"""
= np.array(p), np.array(q)
p, q # Avoid log(0) by adding small epsilon
= 1e-15
epsilon = np.clip(q, epsilon, 1)
q return -np.sum(p * np.log2(q))
# Example: True vs predicted distributions
= [0.7, 0.2, 0.1]
true_dist = [0.6, 0.3, 0.1] # Close to true
pred_dist1 = [0.1, 0.1, 0.8] # Far from true
pred_dist2
print(f"True distribution entropy: {shannon_entropy(true_dist):.3f} bits")
print(f"Cross-entropy (close prediction): {cross_entropy(true_dist, pred_dist1):.3f} bits")
print(f"Cross-entropy (poor prediction): {cross_entropy(true_dist, pred_dist2):.3f} bits")
Kullback-Leibler (KL) Divergence
KL divergence measures how different two probability distributions are:
D_KL(P||Q) = Σ P(x) log₂(P(x)/Q(x))
def kl_divergence(p, q):
"""Calculate KL divergence from q to p"""
= np.array(p), np.array(q)
p, q = 1e-15
epsilon = np.clip(p, epsilon, 1)
p = np.clip(q, epsilon, 1)
q return np.sum(p * np.log2(p / q))
# Properties of KL divergence
print("KL Divergence Properties:")
print(f"D_KL(P||P) = {kl_divergence(true_dist, true_dist):.6f} (always 0)")
print(f"D_KL(P||Q1) = {kl_divergence(true_dist, pred_dist1):.3f}")
print(f"D_KL(P||Q2) = {kl_divergence(true_dist, pred_dist2):.3f}")
print(f"D_KL(Q1||P) = {kl_divergence(pred_dist1, true_dist):.3f} (asymmetric)")
# Relationship: H(P,Q) = H(P) + D_KL(P||Q)
= shannon_entropy(true_dist)
h_p = cross_entropy(true_dist, pred_dist1)
h_pq1 = kl_divergence(true_dist, pred_dist1)
kl_pq1
print(f"\nRelationship verification:")
print(f"H(P) = {h_p:.3f}")
print(f"H(P,Q) = {h_pq1:.3f}")
print(f"D_KL(P||Q) = {kl_pq1:.3f}")
print(f"H(P) + D_KL(P||Q) = {h_p + kl_pq1:.3f}")
Applications in Machine Learning
Decision Trees and Information Gain
def information_gain(parent, children):
"""Calculate information gain for a split"""
= shannon_entropy(parent)
parent_entropy
# Weighted average of children entropies
= sum(len(child) for child in children)
total_samples = sum(
weighted_entropy len(child) / total_samples) * shannon_entropy(child)
(for child in children
)
return parent_entropy - weighted_entropy
# Example: Binary classification dataset
# Class distribution before split
= [0.6, 0.4] # 60% class 0, 40% class 1
parent_classes
# After split on feature
= [0.9, 0.1] # 90% class 0, 10% class 1
left_child = [0.2, 0.8] # 20% class 0, 80% class 1
right_child
# Assume equal-sized children for simplicity
= [left_child, right_child]
children
= information_gain(parent_classes, children)
gain print(f"Information gain from split: {gain:.3f} bits")
# Compare with different splits
print("\nComparing different splits:")
= [
splits "Good split", [[0.9, 0.1], [0.2, 0.8]]),
("Poor split", [[0.55, 0.45], [0.65, 0.35]]),
("Perfect split", [[1.0, 0.0], [0.0, 1.0]])
(
]
for name, children in splits:
= information_gain(parent_classes, children)
gain print(f"{name}: {gain:.3f} bits")
Feature Selection using Mutual Information
from sklearn.feature_selection import mutual_info_classif
from sklearn.datasets import make_classification
# Generate sample dataset
= make_classification(n_samples=1000, n_features=10, n_informative=5,
X, y =2, n_clusters_per_class=1, random_state=42)
n_redundant
# Calculate mutual information between features and target
= mutual_info_classif(X, y, random_state=42)
mi_scores
# Visualize feature importance
=(10, 6))
plt.figure(figsize= [f'Feature {i}' for i in range(X.shape[1])]
feature_names
plt.bar(feature_names, mi_scores)'Features')
plt.xlabel('Mutual Information')
plt.ylabel('Feature Importance using Mutual Information')
plt.title(=45)
plt.xticks(rotationTrue, alpha=0.3)
plt.grid(
plt.tight_layout()
plt.show()
print("Feature ranking by mutual information:")
for i, score in sorted(enumerate(mi_scores), key=lambda x: x[1], reverse=True):
print(f"Feature {i}: {score:.3f}")
Entropy in Compression
Huffman Coding Efficiency
import heapq
from collections import defaultdict, Counter
class HuffmanNode:
def __init__(self, char=None, freq=0, left=None, right=None):
self.char = char
self.freq = freq
self.left = left
self.right = right
def __lt__(self, other):
return self.freq < other.freq
def build_huffman_tree(text):
"""Build Huffman tree from text"""
# Count character frequencies
= Counter(text)
freq
# Create priority queue
= [HuffmanNode(char, freq) for char, freq in freq.items()]
heap
heapq.heapify(heap)
# Build tree
while len(heap) > 1:
= heapq.heappop(heap)
left = heapq.heappop(heap)
right = HuffmanNode(freq=left.freq + right.freq, left=left, right=right)
merged
heapq.heappush(heap, merged)
return heap[0] if heap else None
def get_huffman_codes(root):
"""Get Huffman codes from tree"""
if not root:
return {}
= {}
codes
def traverse(node, code=""):
if node.char is not None: # Leaf node
= code or "0" # Handle single character case
codes[node.char] else:
+ "0")
traverse(node.left, code + "1")
traverse(node.right, code
traverse(root)return codes
def analyze_compression_efficiency(text):
"""Analyze Huffman coding efficiency"""
# Calculate entropy
= Counter(text)
char_counts = len(text)
total_chars = [count / total_chars for count in char_counts.values()]
probabilities = shannon_entropy(probabilities)
entropy
# Build Huffman tree and get codes
= build_huffman_tree(text)
root = get_huffman_codes(root)
codes
# Calculate average code length
= sum(char_counts[char] * len(code) for char, code in codes.items()) / total_chars
avg_length
# Calculate compression ratio
= len(text) * 8 # ASCII encoding
original_bits = sum(char_counts[char] * len(codes[char]) for char in char_counts)
compressed_bits = compressed_bits / original_bits
compression_ratio
print(f"Text: '{text[:50]}{'...' if len(text) > 50 else ''}'")
print(f"Entropy: {entropy:.3f} bits per symbol")
print(f"Average Huffman code length: {avg_length:.3f} bits per symbol")
print(f"Efficiency: {entropy / avg_length:.3f} ({entropy / avg_length * 100:.1f}%)")
print(f"Compression ratio: {compression_ratio:.3f} ({compression_ratio * 100:.1f}%)")
print(f"Space saved: {(1 - compression_ratio) * 100:.1f}%")
print()
return entropy, avg_length, codes
# Test with different types of text
= [
test_texts "AAAAAAAAAA", # Low entropy
"ABABABAB", # Medium entropy
"ABCDEFGHIJ", # High entropy
"The quick brown fox jumps over the lazy dog" # Natural text
]
for text in test_texts:
analyze_compression_efficiency(text)
Practical Applications
Password Strength Analysis
import string
import math
def password_entropy(password):
"""Estimate password entropy"""
# Determine character set size
= 0
charset_size if any(c.islower() for c in password):
+= 26 # lowercase
charset_size if any(c.isupper() for c in password):
+= 26 # uppercase
charset_size if any(c.isdigit() for c in password):
+= 10 # digits
charset_size if any(c in string.punctuation for c in password):
+= len(string.punctuation) # special characters
charset_size
# Calculate entropy assuming uniform distribution
= len(password) * math.log2(charset_size)
entropy
return entropy, charset_size
def analyze_password_strength(passwords):
"""Analyze strength of different passwords"""
print("Password Strength Analysis:")
print("-" * 60)
for pwd in passwords:
= password_entropy(pwd)
entropy, charset
# Strength categories
if entropy < 30:
= "Very Weak"
strength elif entropy < 50:
= "Weak"
strength elif entropy < 70:
= "Moderate"
strength elif entropy < 90:
= "Strong"
strength else:
= "Very Strong"
strength
print(f"Password: {'*' * len(pwd)} (length: {len(pwd)})")
print(f"Charset size: {charset}")
print(f"Entropy: {entropy:.1f} bits")
print(f"Strength: {strength}")
print(f"Time to crack (1B guesses/sec): {2**(entropy-1) / 1e9:.2e} seconds")
print()
# Test passwords
= [
test_passwords "123456",
"password",
"Password1",
"P@ssw0rd!",
"MyVeryLongAndComplexPassword123!@#"
]
analyze_password_strength(test_passwords)
Network Protocol Efficiency
def protocol_efficiency_analysis():
"""Analyze efficiency of different encoding schemes"""
# Simulate message types and their frequencies
= {
message_types 'ACK': 0.4, # Acknowledgment
'DATA': 0.3, # Data packet
'NACK': 0.1, # Negative acknowledgment
'PING': 0.1, # Ping
'CLOSE': 0.05, # Close connection
'ERROR': 0.05 # Error message
}
# Fixed-length encoding (3 bits per message type)
= 3
fixed_length
# Calculate optimal variable-length encoding
= list(message_types.values())
probabilities = shannon_entropy(probabilities)
entropy
# Huffman-like optimal encoding lengths
# Sort by probability (descending)
= sorted(message_types.items(), key=lambda x: x[1], reverse=True)
sorted_types
# Assign code lengths (simplified Huffman)
= {
optimal_codes 0][0]: 1, # Most frequent: 1 bit
sorted_types[1][0]: 2, # Second: 2 bits
sorted_types[2][0]: 3, # Third: 3 bits
sorted_types[3][0]: 3, # Fourth: 3 bits
sorted_types[4][0]: 4, # Fifth: 4 bits
sorted_types[5][0]: 4 # Sixth: 4 bits
sorted_types[
}
# Calculate average lengths
= fixed_length
avg_fixed = sum(message_types[msg] * length for msg, length in optimal_codes.items())
avg_optimal
print("Network Protocol Encoding Analysis:")
print("-" * 50)
print(f"Message entropy: {entropy:.3f} bits")
print(f"Fixed-length encoding: {avg_fixed} bits per message")
print(f"Optimal variable-length: {avg_optimal:.3f} bits per message")
print(f"Efficiency gain: {(avg_fixed - avg_optimal) / avg_fixed * 100:.1f}%")
print()
print("Message type frequencies and optimal codes:")
for msg, prob in sorted_types:
print(f"{msg}: {prob:.2%} -> {optimal_codes[msg]} bits")
protocol_efficiency_analysis()
Summary
Entropy and information content are fundamental concepts that:
- Quantify Information: Measure how much information is contained in data
- Guide Compression: Determine theoretical limits of data compression
- Optimize Communication: Design efficient encoding schemes
- Analyze Uncertainty: Measure randomness and predictability
- Feature Selection: Identify informative features in machine learning
- Security Analysis: Evaluate password strength and cryptographic systems
Understanding these concepts enables you to: - Design efficient data compression algorithms - Analyze the information content of datasets - Optimize communication protocols - Build better machine learning models - Evaluate security systems