Statistical Foundations for Data Science
Statistics provides the mathematical framework for understanding data, quantifying uncertainty, and making data-driven decisions. This section covers fundamental statistical concepts essential for data science.
Descriptive Statistics
Measures of Central Tendency
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# Generate sample data
np.random.seed(42)
data = np.random.normal(100, 15, 1000)        # Normal distribution
skewed_data = np.random.exponential(2, 1000)  # Skewed distribution

def calculate_central_tendency(data, name):
    """Calculate and display measures of central tendency"""
    mean = np.mean(data)
    median = np.median(data)
    mode_result = stats.mode(data, keepdims=True)
    mode = mode_result.mode[0] if len(mode_result.mode) > 0 else np.nan

    print(f"\n{name}:")
    print(f"Mean: {mean:.2f}")
    print(f"Median: {median:.2f}")
    print(f"Mode: {mode:.2f}")
    return mean, median, mode

# Analyze both datasets
mean1, median1, mode1 = calculate_central_tendency(data, "Normal Distribution")
mean2, median2, mode2 = calculate_central_tendency(skewed_data, "Skewed Distribution")

# Visualize the differences
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Normal distribution
ax1.hist(data, bins=50, alpha=0.7, density=True, color='skyblue')
ax1.axvline(mean1, color='red', linestyle='--', label=f'Mean: {mean1:.1f}')
ax1.axvline(median1, color='green', linestyle='--', label=f'Median: {median1:.1f}')
ax1.set_title('Normal Distribution')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Skewed distribution
ax2.hist(skewed_data, bins=50, alpha=0.7, density=True, color='lightcoral')
ax2.axvline(mean2, color='red', linestyle='--', label=f'Mean: {mean2:.1f}')
ax2.axvline(median2, color='green', linestyle='--', label=f'Median: {median2:.1f}')
ax2.set_title('Skewed Distribution')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
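As a quick cross-check, pandas (already imported above) reports most of these summaries in a single call; a minimal sketch using the arrays defined earlier:

# describe() reports count, mean, std, min, quartiles, and max in one call
print(pd.Series(data).describe())
print(pd.Series(skewed_data).median())  # median directly, matching np.median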
Measures of Variability
def calculate_variability(data, name):
    """Calculate measures of variability"""
    variance = np.var(data, ddof=1)   # Sample variance
    std_dev = np.std(data, ddof=1)    # Sample standard deviation
    range_val = np.max(data) - np.min(data)
    iqr = np.percentile(data, 75) - np.percentile(data, 25)
    mad = np.median(np.abs(data - np.median(data)))  # Median Absolute Deviation

    print(f"\n{name} - Variability Measures:")
    print(f"Variance: {variance:.2f}")
    print(f"Standard Deviation: {std_dev:.2f}")
    print(f"Range: {range_val:.2f}")
    print(f"Interquartile Range (IQR): {iqr:.2f}")
    print(f"Median Absolute Deviation: {mad:.2f}")
    return variance, std_dev, range_val, iqr, mad

# Calculate variability for both datasets
calculate_variability(data, "Normal Distribution")
calculate_variability(skewed_data, "Skewed Distribution")

# Box plots to visualize variability
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.boxplot(data, labels=['Normal'])
plt.title('Normal Distribution')
plt.grid(True, alpha=0.3)

plt.subplot(1, 2, 2)
plt.boxplot(skewed_data, labels=['Skewed'])
plt.title('Skewed Distribution')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
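SciPy also ships a built-in IQR that should agree with the percentile arithmetic above; a small sanity check, assuming the `data` array from earlier:

# stats.iqr defaults to the 25th–75th percentile range
print(stats.iqr(data))
print(np.percentile(data, 75) - np.percentile(data, 25))  # should match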
Distribution Shape
def analyze_distribution_shape(data, name):
    """Analyze skewness and kurtosis"""
    skewness = stats.skew(data)
    kurt = stats.kurtosis(data)

    print(f"\n{name} - Shape Measures:")
    print(f"Skewness: {skewness:.3f}")
    if skewness > 0.5:
        print("  → Right-skewed (positive skew)")
    elif skewness < -0.5:
        print("  → Left-skewed (negative skew)")
    else:
        print("  → Approximately symmetric")

    print(f"Kurtosis: {kurt:.3f}")
    if kurt > 0:
        print("  → Leptokurtic (heavy tails)")
    elif kurt < 0:
        print("  → Platykurtic (light tails)")
    else:
        print("  → Mesokurtic (normal tails)")
    return skewness, kurt

analyze_distribution_shape(data, "Normal Distribution")
analyze_distribution_shape(skewed_data, "Skewed Distribution")
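Because skewness and kurtosis jointly characterize departure from normality, they can be folded into a formal test. A short sketch using SciPy's D'Agostino-Pearson test (stats.normaltest), applied to the two datasets above:

# normaltest combines skewness and kurtosis into a single statistic;
# small p-values indicate departure from normality
for sample, label in [(data, "Normal"), (skewed_data, "Skewed")]:
    result = stats.normaltest(sample)
    print(f"{label}: statistic={result.statistic:.2f}, p-value={result.pvalue:.4f}")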
Probability Fundamentals
Basic Probability Rules
def demonstrate_probability_rules():
    """Demonstrate basic probability rules with examples"""
    # Simulate coin flips
    n_flips = 10000
    coin_flips = np.random.choice(['H', 'T'], n_flips)

    # Calculate probabilities
    p_heads = np.sum(coin_flips == 'H') / n_flips
    p_tails = np.sum(coin_flips == 'T') / n_flips

    print("Basic Probability Rules:")
    print(f"P(Heads) = {p_heads:.3f}")
    print(f"P(Tails) = {p_tails:.3f}")
    print(f"P(Heads) + P(Tails) = {p_heads + p_tails:.3f}")
    print("Rule: Sum of all probabilities = 1")

    # Simulate dice rolls
    dice_rolls = np.random.randint(1, 7, n_flips)

    # Joint probability example
    even_rolls = dice_rolls % 2 == 0
    high_rolls = dice_rolls >= 4

    p_even = np.mean(even_rolls)
    p_high = np.mean(high_rolls)
    p_even_and_high = np.mean(even_rolls & high_rolls)
    p_even_or_high = np.mean(even_rolls | high_rolls)

    print("\nDice Roll Probabilities:")
    print(f"P(Even) = {p_even:.3f}")
    print(f"P(High ≥ 4) = {p_high:.3f}")
    print(f"P(Even AND High) = {p_even_and_high:.3f}")
    print(f"P(Even OR High) = {p_even_or_high:.3f}")

    # Verify addition rule: P(A ∪ B) = P(A) + P(B) - P(A ∩ B)
    calculated_or = p_even + p_high - p_even_and_high
    print(f"Calculated P(Even OR High) = {calculated_or:.3f}")
    print(f"Difference: {abs(p_even_or_high - calculated_or):.6f}")

demonstrate_probability_rules()
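The same simulation style extends naturally to conditional probability, the subject of the next subsection. A brief sketch: P(High | Even) is simply the fraction of even rolls that are also high.

# P(High | Even) = P(Even AND High) / P(Even); theory gives |{4, 6}| / |{2, 4, 6}| = 2/3
rolls = np.random.randint(1, 7, 10000)
even = rolls % 2 == 0
high = rolls >= 4
print(f"P(High | Even) = {np.mean(even & high) / np.mean(even):.3f}")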
Conditional Probability and Bayes’ Theorem
def bayes_theorem_example():
    """Medical diagnosis example using Bayes' theorem"""
    # Disease prevalence and test accuracy
    disease_prevalence = 0.01  # 1% of population has disease
    test_sensitivity = 0.95    # 95% true positive rate
    test_specificity = 0.90    # 90% true negative rate

    # Calculate probabilities
    p_disease = disease_prevalence
    p_no_disease = 1 - disease_prevalence
    p_positive_given_disease = test_sensitivity
    p_positive_given_no_disease = 1 - test_specificity

    # Total probability of a positive test (law of total probability)
    p_positive = (p_positive_given_disease * p_disease +
                  p_positive_given_no_disease * p_no_disease)

    # Bayes' theorem: P(Disease|Positive) = P(Positive|Disease) * P(Disease) / P(Positive)
    p_disease_given_positive = (p_positive_given_disease * p_disease) / p_positive

    print("Bayes' Theorem Example - Medical Diagnosis:")
    print(f"Disease prevalence: {disease_prevalence:.1%}")
    print(f"Test sensitivity: {test_sensitivity:.1%}")
    print(f"Test specificity: {test_specificity:.1%}")
    print(f"\nP(Positive test) = {p_positive:.3f}")
    print(f"P(Disease | Positive test) = {p_disease_given_positive:.3f} ({p_disease_given_positive:.1%})")
    print("\nInterpretation:")
    print(f"Even with a positive test, there's only a {p_disease_given_positive:.1%} chance of having the disease!")
    print("This is due to the low base rate (prevalence) of the disease.")
    return p_disease_given_positive

# Simulate the medical test scenario
def simulate_medical_test(n_people=100000):
    """Simulate medical testing scenario"""
    # Generate population
    has_disease = np.random.random(n_people) < 0.01  # 1% prevalence

    # Generate test results
    test_results = np.zeros(n_people, dtype=bool)

    # People with disease: 95% test positive
    disease_indices = np.where(has_disease)[0]
    test_results[disease_indices] = np.random.random(len(disease_indices)) < 0.95

    # People without disease: 10% test positive (false positives)
    no_disease_indices = np.where(~has_disease)[0]
    test_results[no_disease_indices] = np.random.random(len(no_disease_indices)) < 0.10

    # Calculate actual probabilities from the simulation
    positive_tests = test_results
    true_positives = has_disease & positive_tests
    false_positives = (~has_disease) & positive_tests

    simulated_p_disease_given_positive = np.sum(true_positives) / np.sum(positive_tests)

    print(f"\nSimulation Results (n={n_people:,}):")
    print(f"People with disease: {np.sum(has_disease):,}")
    print(f"Positive tests: {np.sum(positive_tests):,}")
    print(f"True positives: {np.sum(true_positives):,}")
    print(f"False positives: {np.sum(false_positives):,}")
    print(f"Simulated P(Disease | Positive) = {simulated_p_disease_given_positive:.3f}")

theoretical_prob = bayes_theorem_example()
simulate_medical_test()
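Working the arithmetic by hand confirms both the theoretical and simulated results: P(Disease | Positive) = (0.95 × 0.01) / (0.95 × 0.01 + 0.10 × 0.99) = 0.0095 / 0.1085 ≈ 0.088, or roughly 8.8%.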
Sampling and Sampling Distributions
Central Limit Theorem
def demonstrate_central_limit_theorem():
    """Demonstrate the Central Limit Theorem"""
    # Create different population distributions
    populations = {
        'Uniform': np.random.uniform(0, 10, 10000),
        'Exponential': np.random.exponential(2, 10000),
        'Bimodal': np.concatenate([np.random.normal(2, 1, 5000),
                                   np.random.normal(8, 1, 5000)])
    }

    sample_sizes = [5, 10, 30, 100]
    n_samples = 1000

    fig, axes = plt.subplots(len(populations), len(sample_sizes) + 1,
                             figsize=(20, 12))

    for i, (pop_name, population) in enumerate(populations.items()):
        # Plot original population
        axes[i, 0].hist(population, bins=50, alpha=0.7, density=True)
        axes[i, 0].set_title(f'{pop_name} Population')
        axes[i, 0].grid(True, alpha=0.3)

        # For each sample size, create a sampling distribution
        for j, sample_size in enumerate(sample_sizes):
            sample_means = []
            for _ in range(n_samples):
                sample = np.random.choice(population, sample_size)
                sample_means.append(np.mean(sample))
            sample_means = np.array(sample_means)

            # Plot sampling distribution
            axes[i, j + 1].hist(sample_means, bins=30, alpha=0.7, density=True)
            axes[i, j + 1].set_title(f'Sample Means (n={sample_size})')
            axes[i, j + 1].grid(True, alpha=0.3)

            # Add normal curve overlay
            x = np.linspace(sample_means.min(), sample_means.max(), 100)
            normal_curve = stats.norm.pdf(x, np.mean(sample_means), np.std(sample_means))
            axes[i, j + 1].plot(x, normal_curve, 'r-', linewidth=2, alpha=0.8)

            # Calculate and display statistics
            mean_of_means = np.mean(sample_means)
            std_of_means = np.std(sample_means)
            theoretical_std = np.std(population) / np.sqrt(sample_size)

            axes[i, j + 1].text(0.02, 0.98,
                                f'Mean: {mean_of_means:.2f}\nStd: {std_of_means:.2f}\nTheoretical: {theoretical_std:.2f}',
                                transform=axes[i, j + 1].transAxes,
                                verticalalignment='top',
                                bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

    plt.tight_layout()
    plt.show()

    print("Central Limit Theorem Observations:")
    print("1. As sample size increases, the sampling distribution becomes more normal")
    print("2. The mean of the sampling distribution equals the population mean")
    print("3. Standard error = population std / sqrt(sample size)")
    print("4. This holds regardless of the original population distribution!")

demonstrate_central_limit_theorem()
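The theoretical standard errors in the plot annotations can be verified analytically. For instance, a Uniform(0, 10) population has σ = (10 − 0)/√12 ≈ 2.887, so a quick sketch of the check at n = 100:

# Standard error of the mean: σ / sqrt(n)
sigma_uniform = 10 / np.sqrt(12)  # exact σ for Uniform(0, 10)
print(f"SE at n=100: {sigma_uniform / np.sqrt(100):.3f}")  # ≈ 0.289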
Confidence Intervals
def confidence_intervals_analysis():
    """Analyze confidence intervals and their interpretation"""
    # True population parameters
    true_mean = 100
    true_std = 15

    # Generate samples and calculate confidence intervals
    sample_sizes = [10, 30, 100, 500]
    confidence_levels = [0.90, 0.95, 0.99]
    n_simulations = 1000

    results = {}

    for sample_size in sample_sizes:
        for conf_level in confidence_levels:
            coverage_count = 0
            intervals = []

            for _ in range(n_simulations):
                # Generate sample
                sample = np.random.normal(true_mean, true_std, sample_size)
                sample_mean = np.mean(sample)
                sample_std = np.std(sample, ddof=1)

                # Calculate confidence interval
                alpha = 1 - conf_level
                t_critical = stats.t.ppf(1 - alpha / 2, df=sample_size - 1)
                margin_error = t_critical * (sample_std / np.sqrt(sample_size))

                ci_lower = sample_mean - margin_error
                ci_upper = sample_mean + margin_error
                intervals.append((ci_lower, ci_upper))

                # Check if the interval contains the true mean
                if ci_lower <= true_mean <= ci_upper:
                    coverage_count += 1

            coverage_rate = coverage_count / n_simulations
            results[(sample_size, conf_level)] = {
                'coverage_rate': coverage_rate,
                'intervals': intervals
            }

    # Display results
    print("Confidence Interval Coverage Analysis:")
    print("=" * 60)
    print(f"{'Sample Size':>12} {'Conf Level':>12} {'Coverage Rate':>15} {'Expected':>10}")
    print("-" * 60)
    for (sample_size, conf_level), result in results.items():
        coverage_rate = result['coverage_rate']
        print(f"{sample_size:>12} {conf_level:>12.0%} {coverage_rate:>15.1%} {conf_level:>10.0%}")

    # Visualize confidence intervals for one case
    sample_size = 30
    conf_level = 0.95
    intervals = results[(sample_size, conf_level)]['intervals'][:50]  # Show first 50

    plt.figure(figsize=(12, 8))
    for i, (lower, upper) in enumerate(intervals):
        color = 'green' if lower <= true_mean <= upper else 'red'
        plt.plot([lower, upper], [i, i], color=color, linewidth=2, alpha=0.7)
        plt.plot([(lower + upper) / 2], [i], 'o', color=color, markersize=3)

    plt.axvline(true_mean, color='blue', linestyle='--', linewidth=2,
                label=f'True Mean = {true_mean}')
    plt.xlabel('Value')
    plt.ylabel('Sample Number')
    plt.title(f'{conf_level:.0%} Confidence Intervals (n={sample_size})')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

    print("\nInterpretation:")
    print(f"- Green intervals contain the true mean ({true_mean})")
    print("- Red intervals do not contain the true mean")
    print(f"- About {conf_level:.0%} of intervals should be green")

confidence_intervals_analysis()
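For routine work, SciPy can produce the same t-based interval in one call. A minimal sketch (the first argument is named confidence in recent SciPy releases and alpha in older ones, so it is passed positionally here):

# One-call confidence interval: stats.t.interval(level, df, loc=mean, scale=SEM)
sample = np.random.normal(100, 15, 30)
ci_low, ci_high = stats.t.interval(0.95, len(sample) - 1,
                                   loc=np.mean(sample),
                                   scale=stats.sem(sample))
print(f"95% CI: ({ci_low:.2f}, {ci_high:.2f})")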
Statistical Inference
Hypothesis Testing Framework
def hypothesis_testing_framework():
    """Demonstrate hypothesis testing concepts"""
    print("Hypothesis Testing Framework:")
    print("=" * 50)

    # Example: Testing if a coin is fair
    print("Example: Testing if a coin is fair")
    print("H₀: p = 0.5 (coin is fair)")
    print("H₁: p ≠ 0.5 (coin is biased)")
    print()

    # Simulate coin flips
    n_flips = 100
    true_p = 0.6  # Biased coin (unknown to us)
    observed_heads = np.random.binomial(n_flips, true_p)
    observed_p = observed_heads / n_flips

    print(f"Observed: {observed_heads} heads out of {n_flips} flips")
    print(f"Sample proportion: {observed_p:.3f}")

    # Calculate test statistic (z-test for a proportion)
    null_p = 0.5
    standard_error = np.sqrt(null_p * (1 - null_p) / n_flips)
    z_statistic = (observed_p - null_p) / standard_error

    # Calculate p-value (two-tailed test)
    p_value = 2 * (1 - stats.norm.cdf(abs(z_statistic)))

    print("\nTest Results:")
    print(f"Z-statistic: {z_statistic:.3f}")
    print(f"P-value: {p_value:.4f}")

    # Decision
    alpha = 0.05
    if p_value < alpha:
        print(f"Decision: Reject H₀ (p-value < {alpha})")
        print("Conclusion: Evidence suggests the coin is biased")
    else:
        print(f"Decision: Fail to reject H₀ (p-value ≥ {alpha})")
        print("Conclusion: Insufficient evidence that the coin is biased")

    return z_statistic, p_value

z_stat, p_val = hypothesis_testing_framework()
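With only 100 flips, the normal approximation is serviceable but not exact. SciPy (≥ 1.7) offers an exact binomial test; a sketch using hypothetical counts in the spirit of the example above:

# Exact two-sided binomial test of H₀: p = 0.5
heads, n_flips = 60, 100  # hypothetical counts matching the biased-coin example
result = stats.binomtest(heads, n_flips, p=0.5, alternative='two-sided')
print(f"Exact p-value: {result.pvalue:.4f}")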
Type I and Type II Errors
def error_types_analysis():
    """Analyze Type I and Type II errors"""
    # Simulation parameters
    null_mean = 100  # H₀: μ = 100
    sample_size = 30
    alpha = 0.05
    n_simulations = 10000

    # Test different true means to see error rates
    true_means = np.arange(95, 106, 0.5)

    type_i_errors = []
    type_ii_errors = []
    power_values = []

    for true_mean in true_means:
        reject_count = 0

        for _ in range(n_simulations):
            # Generate sample from the true distribution
            sample = np.random.normal(true_mean, 15, sample_size)
            sample_mean = np.mean(sample)

            # Test statistic using the known population σ = 15, referred to a t distribution
            t_statistic = (sample_mean - null_mean) / (15 / np.sqrt(sample_size))
            p_value = 2 * (1 - stats.t.cdf(abs(t_statistic), df=sample_size - 1))

            if p_value < alpha:
                reject_count += 1

        rejection_rate = reject_count / n_simulations

        if true_mean == null_mean:
            # Type I error rate (rejecting a true null)
            type_i_error_rate = rejection_rate
            type_i_errors.append(type_i_error_rate)
        else:
            # Power (correctly rejecting a false null)
            power = rejection_rate
            power_values.append(power)
            # Type II error rate (failing to reject a false null)
            type_ii_error_rate = 1 - power
            type_ii_errors.append(type_ii_error_rate)

    # Plot power curve
    plt.figure(figsize=(12, 8))

    plt.subplot(2, 1, 1)
    plt.plot(true_means[true_means != null_mean], power_values, 'b-', linewidth=2, label='Power')
    plt.axhline(alpha, color='red', linestyle='--', label=f'α = {alpha}')
    plt.axvline(null_mean, color='gray', linestyle=':', alpha=0.7, label='H₀: μ = 100')
    plt.xlabel('True Mean')
    plt.ylabel('Power (1 - β)')
    plt.title('Statistical Power Curve')
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.subplot(2, 1, 2)
    plt.plot(true_means[true_means != null_mean], type_ii_errors, 'r-', linewidth=2,
             label='Type II Error Rate (β)')
    plt.axhline(alpha, color='blue', linestyle='--', label=f'α = {alpha}')
    plt.axvline(null_mean, color='gray', linestyle=':', alpha=0.7, label='H₀: μ = 100')
    plt.xlabel('True Mean')
    plt.ylabel('Type II Error Rate (β)')
    plt.title('Type II Error Rate')
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    print("Error Types Analysis:")
    print(f"Type I Error Rate (α): {type_i_error_rate:.3f} (should be ≈ {alpha})")
    print("Type II Error Rate varies with effect size")
    print("Power increases as the true mean moves away from the null hypothesis")

    # Effect size and power relationship
    effect_sizes = np.abs(true_means[true_means != null_mean] - null_mean) / 15

    plt.figure(figsize=(10, 6))
    plt.plot(effect_sizes, power_values, 'go-', linewidth=2, markersize=6)
    plt.xlabel("Effect Size (Cohen's d)")
    plt.ylabel('Statistical Power')
    plt.title('Power vs Effect Size')
    plt.grid(True, alpha=0.3)
    plt.axhline(0.8, color='red', linestyle='--', label='Conventional Power = 0.8')
    plt.legend()
    plt.show()

error_types_analysis()
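The simulated power curve can be cross-checked analytically. Under a normal approximation with known σ, the test statistic is distributed Normal(δ, 1) where δ = (μ − μ₀)/(σ/√n), so power is the probability that |Z| exceeds the critical value. A sketch under those assumptions:

def approx_power(true_mean, null_mean=100, sigma=15, n=30, alpha=0.05):
    """Normal-approximation power for a two-sided test of H₀: μ = null_mean."""
    delta = (true_mean - null_mean) / (sigma / np.sqrt(n))
    z_crit = stats.norm.ppf(1 - alpha / 2)
    # Reject when |Z| > z_crit; Z ~ Normal(delta, 1) under the true mean
    return (1 - stats.norm.cdf(z_crit - delta)) + stats.norm.cdf(-z_crit - delta)

print(f"Approximate power at μ = 105: {approx_power(105):.3f}")  # ≈ 0.45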
Correlation and Causation
Correlation Analysis
def correlation_analysis():
    """Comprehensive correlation analysis"""
    # Generate different types of relationships
    n = 1000
    x = np.random.normal(0, 1, n)

    # Different correlation patterns
    relationships = {
        'Strong Positive': x + 0.2 * np.random.normal(0, 1, n),
        'Weak Positive': x + 2 * np.random.normal(0, 1, n),
        'No Correlation': np.random.normal(0, 1, n),
        'Strong Negative': -x + 0.2 * np.random.normal(0, 1, n),
        'Non-linear': x**2 + 0.5 * np.random.normal(0, 1, n)
    }

    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    axes = axes.flatten()

    correlations = {}

    for i, (name, y) in enumerate(relationships.items()):
        if i < len(axes):
            # Calculate correlations
            pearson_r, pearson_p = stats.pearsonr(x, y)
            spearman_r, spearman_p = stats.spearmanr(x, y)

            correlations[name] = {
                'pearson': pearson_r,
                'spearman': spearman_r,
                'pearson_p': pearson_p,
                'spearman_p': spearman_p
            }

            # Plot
            axes[i].scatter(x, y, alpha=0.6, s=20)
            axes[i].set_title(f'{name}\nPearson r = {pearson_r:.3f}\nSpearman ρ = {spearman_r:.3f}')
            axes[i].grid(True, alpha=0.3)

            # Add trend line for linear relationships
            if abs(pearson_r) > 0.3:
                z = np.polyfit(x, y, 1)
                p = np.poly1d(z)
                axes[i].plot(sorted(x), p(sorted(x)), "r--", alpha=0.8)

    # Remove empty subplot
    if len(relationships) < len(axes):
        fig.delaxes(axes[-1])

    plt.tight_layout()
    plt.show()

    # Display correlation summary
    print("Correlation Analysis Summary:")
    print("=" * 60)
    print(f"{'Relationship':>15} {'Pearson r':>12} {'p-value':>10} {'Spearman ρ':>12} {'p-value':>10}")
    print("-" * 60)
    for name, corr in correlations.items():
        print(f"{name:>15} {corr['pearson']:>12.3f} {corr['pearson_p']:>10.4f} "
              f"{corr['spearman']:>12.3f} {corr['spearman_p']:>10.4f}")

    print("\nInterpretation Guidelines:")
    print("Pearson correlation (r):")
    print("  |r| < 0.3: Weak relationship")
    print("  0.3 ≤ |r| < 0.7: Moderate relationship")
    print("  |r| ≥ 0.7: Strong relationship")
    print("\nSpearman correlation (ρ): Measures monotonic relationships")
    print("Use when the relationship is not linear but monotonic")

correlation_analysis()
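When many variables are involved, computing coefficients pairwise becomes tedious; pandas (imported above) returns a whole correlation matrix in one call. A small sketch with hypothetical columns:

# Correlation matrices via pandas; method can be 'pearson', 'spearman', or 'kendall'
base = np.random.normal(0, 1, 1000)
df = pd.DataFrame({
    'x': base,
    'strong_pos': base + 0.2 * np.random.normal(0, 1, 1000),
    'noise': np.random.normal(0, 1, 1000),
})
print(df.corr(method='pearson').round(3))
print(df.corr(method='spearman').round(3))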
Spurious Correlations and Confounding
def spurious_correlation_example():
    """Demonstrate spurious correlations and confounding variables"""
    n = 1000

    # Example: Ice cream sales and drowning deaths
    # Both are caused by temperature (the confounding variable)
    temperature = np.random.normal(75, 10, n)  # Temperature in Fahrenheit

    # Ice cream sales increase with temperature
    ice_cream_sales = 50 + 2 * (temperature - 70) + np.random.normal(0, 10, n)
    ice_cream_sales = np.maximum(ice_cream_sales, 0)  # Can't be negative

    # Drowning deaths also increase with temperature (more swimming)
    drowning_deaths = 2 + 0.1 * (temperature - 70) + np.random.poisson(1, n)
    drowning_deaths = np.maximum(drowning_deaths, 0)

    # Calculate correlations
    corr_ice_drowning = stats.pearsonr(ice_cream_sales, drowning_deaths)[0]
    corr_temp_ice = stats.pearsonr(temperature, ice_cream_sales)[0]
    corr_temp_drowning = stats.pearsonr(temperature, drowning_deaths)[0]

    # Partial correlation (controlling for temperature)
    def partial_correlation(x, y, z):
        """Calculate partial correlation of x and y controlling for z"""
        rxy = stats.pearsonr(x, y)[0]
        rxz = stats.pearsonr(x, z)[0]
        ryz = stats.pearsonr(y, z)[0]
        partial_r = (rxy - rxz * ryz) / (np.sqrt(1 - rxz**2) * np.sqrt(1 - ryz**2))
        return partial_r

    partial_corr = partial_correlation(ice_cream_sales, drowning_deaths, temperature)

    # Visualize
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # Ice cream vs drowning (spurious correlation)
    axes[0, 0].scatter(ice_cream_sales, drowning_deaths, alpha=0.6)
    axes[0, 0].set_xlabel('Ice Cream Sales')
    axes[0, 0].set_ylabel('Drowning Deaths')
    axes[0, 0].set_title(f'Spurious Correlation\nr = {corr_ice_drowning:.3f}')
    axes[0, 0].grid(True, alpha=0.3)

    # Temperature vs ice cream
    axes[0, 1].scatter(temperature, ice_cream_sales, alpha=0.6, color='orange')
    axes[0, 1].set_xlabel('Temperature (°F)')
    axes[0, 1].set_ylabel('Ice Cream Sales')
    axes[0, 1].set_title(f'Temperature → Ice Cream\nr = {corr_temp_ice:.3f}')
    axes[0, 1].grid(True, alpha=0.3)

    # Temperature vs drowning
    axes[1, 0].scatter(temperature, drowning_deaths, alpha=0.6, color='red')
    axes[1, 0].set_xlabel('Temperature (°F)')
    axes[1, 0].set_ylabel('Drowning Deaths')
    axes[1, 0].set_title(f'Temperature → Drowning\nr = {corr_temp_drowning:.3f}')
    axes[1, 0].grid(True, alpha=0.3)

    # Residual plot (controlling for temperature):
    # regress each variable on temperature and keep the residuals
    ice_fit = np.poly1d(np.polyfit(temperature, ice_cream_sales, 1))
    ice_cream_residuals = ice_cream_sales - ice_fit(temperature)
    drown_fit = np.poly1d(np.polyfit(temperature, drowning_deaths, 1))
    drowning_residuals = drowning_deaths - drown_fit(temperature)

    axes[1, 1].scatter(ice_cream_residuals, drowning_residuals, alpha=0.6, color='green')
    axes[1, 1].set_xlabel('Ice Cream Sales (residuals)')
    axes[1, 1].set_ylabel('Drowning Deaths (residuals)')
    axes[1, 1].set_title(f'Controlling for Temperature\nPartial r = {partial_corr:.3f}')
    axes[1, 1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    print("Spurious Correlation Example:")
    print("=" * 50)
    print(f"Ice cream sales ↔ Drowning deaths: r = {corr_ice_drowning:.3f}")
    print(f"Temperature → Ice cream sales: r = {corr_temp_ice:.3f}")
    print(f"Temperature → Drowning deaths: r = {corr_temp_drowning:.3f}")
    print(f"Partial correlation (controlling for temperature): r = {partial_corr:.3f}")
    print()
    print("Key Insight:")
    print("The correlation between ice cream sales and drowning deaths")
    print("disappears when we control for the confounding variable (temperature).")
    print("This demonstrates that correlation ≠ causation!")

spurious_correlation_example()
Summary
Statistical foundations provide the mathematical framework for:
- Data Description: Summarizing and characterizing datasets
- Uncertainty Quantification: Understanding variability and confidence
- Inference: Making conclusions about populations from samples
- Hypothesis Testing: Evaluating claims using statistical evidence
- Relationship Analysis: Understanding associations between variables
Key principles:
- Sampling variability affects all statistical conclusions
- The Central Limit Theorem enables many statistical procedures
- Confidence intervals quantify uncertainty in estimates
- Hypothesis testing provides a framework for decision-making
- Correlation does not imply causation; always consider confounding variables
These concepts form the foundation for more advanced data science techniques including machine learning, experimental design, and causal inference.