Automation Examples
This section contains practical automation scripts for common system administration and development tasks.
System Administration Automation
Automated Backup System
#!/bin/bash
# automated_backup.sh - Comprehensive backup automation system
# Configuration
# Every setting below can be overridden via the environment or via the
# config file sourced by load_config().
BACKUP_CONFIG="${BACKUP_CONFIG:-/etc/backup.conf}"
BACKUP_ROOT="${BACKUP_ROOT:-/backup}"
LOG_FILE="${LOG_FILE:-/var/log/backup.log}"
# Days of timestamped backup directories to keep under BACKUP_ROOT.
RETENTION_DAYS="${RETENTION_DAYS:-30}"
# gzip, bzip2, or none
COMPRESSION="${COMPRESSION:-gzip}"
# When "true" and GPG_RECIPIENT is set, archives are GPG-encrypted.
ENCRYPTION="${ENCRYPTION:-false}"
GPG_RECIPIENT="${GPG_RECIPIENT:-}"
# Default backup sources
# May be overridden by BACKUP_SOURCES in the config file.
declare -a BACKUP_SOURCES=(
"/home"
"/etc"
"/var/www"
"/opt"
)
# Append a timestamped "[LEVEL] message" line to $LOG_FILE and echo it to
# stdout. $1 = level (INFO/WARN/ERROR/...), remaining args = message text.
log() {
  local level="$1"; shift
  printf '%s [%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$level" "$*" | tee -a "$LOG_FILE"
}
# Load configuration
# Source $BACKUP_CONFIG if present; values defined there override the
# defaults declared at the top of the script.
load_config() {
if [ -f "$BACKUP_CONFIG" ]; then
# NOTE(review): sourcing executes arbitrary shell from the config file —
# it should be root-owned and not world-writable.
source "$BACKUP_CONFIG"
log INFO "Configuration loaded from $BACKUP_CONFIG"
else
log WARN "Configuration file not found, using defaults"
fi
}
# Create a timestamped directory for this run under $BACKUP_ROOT and publish
# its path through the global BACKUP_DIR (used by all later backup steps).
setup_backup_dirs() {
  local stamp
  stamp=$(date +%Y%m%d_%H%M%S)
  BACKUP_DIR="$BACKUP_ROOT/$stamp"
  mkdir -p "$BACKUP_DIR"
  log INFO "Created backup directory: $BACKUP_DIR"
}
# Backup database
# Dump one database into $BACKUP_DIR/<name>.sql.
# $1 = db type (mysql|postgresql), $2 = db name, $3 = db user,
# $4 = db host (default: localhost). Reads DB_PASSWORD from the
# environment/config file. Returns 1 on any failure.
backup_database() {
  local db_type="$1"
  local db_name="$2"
  local db_user="$3"
  local db_host="${4:-localhost}"
  log INFO "Starting database backup: $db_type/$db_name"
  case "$db_type" in
    mysql)
      if command -v mysqldump >/dev/null 2>&1; then
        # Pass the password via MYSQL_PWD instead of -p<pw> on argv, where it
        # would be visible to every local user in `ps` output. Also check the
        # dump's exit status: the original logged "completed" even when
        # mysqldump failed (the redirection created the .sql file regardless).
        if MYSQL_PWD="$DB_PASSWORD" mysqldump -h "$db_host" -u "$db_user" "$db_name" > "$BACKUP_DIR/${db_name}.sql"; then
          log INFO "MySQL database backup completed: $db_name"
        else
          log ERROR "mysqldump failed for: $db_name"
          return 1
        fi
      else
        log ERROR "mysqldump not found"
        return 1
      fi
      ;;
    postgresql)
      if command -v pg_dump >/dev/null 2>&1; then
        if PGPASSWORD="$DB_PASSWORD" pg_dump -h "$db_host" -U "$db_user" "$db_name" > "$BACKUP_DIR/${db_name}.sql"; then
          log INFO "PostgreSQL database backup completed: $db_name"
        else
          log ERROR "pg_dump failed for: $db_name"
          return 1
        fi
      else
        log ERROR "pg_dump not found"
        return 1
      fi
      ;;
    *)
      log ERROR "Unsupported database type: $db_type"
      return 1
      ;;
  esac
}
# Backup files and directories
# Archive one directory into $BACKUP_DIR, honouring COMPRESSION
# (gzip/bzip2/none) and optional GPG ENCRYPTION.
# $1 = source directory.
# Returns 0 on success (a missing source only warns, preserving the original
# best-effort behaviour); returns 1 when the archive cannot be created —
# the original always returned 0, so main() could never detect failures.
backup_files() {
  local source="$1"
  local rc=0
  # mktemp instead of the predictable /tmp/backup_exclude.$$, which is
  # guessable and symlink-attackable when this runs as root.
  local exclude_file
  exclude_file=$(mktemp) || return 1
  # Create exclude file
  cat > "$exclude_file" << EOF
*.tmp
*.log
*.cache
*~
.DS_Store
Thumbs.db
EOF
  if [ -d "$source" ]; then
    local backup_name
    backup_name=$(basename "$source")
    local archive_file="$BACKUP_DIR/${backup_name}.tar"
    log INFO "Backing up directory: $source"
    if tar --exclude-from="$exclude_file" -cf "$archive_file" "$source"; then
      # Compress if requested
      if [ "$COMPRESSION" = "gzip" ]; then
        gzip "$archive_file"
        archive_file="${archive_file}.gz"
      elif [ "$COMPRESSION" = "bzip2" ]; then
        bzip2 "$archive_file"
        archive_file="${archive_file}.bz2"
      fi
      # Encrypt if requested (plaintext archive is removed afterwards)
      if [ "$ENCRYPTION" = "true" ] && [ -n "$GPG_RECIPIENT" ]; then
        gpg --trust-model always --encrypt -r "$GPG_RECIPIENT" "$archive_file"
        rm "$archive_file"
        archive_file="${archive_file}.gpg"
      fi
      log INFO "Backup completed: $archive_file"
    else
      log ERROR "Backup failed for: $source"
      rc=1  # propagate the failure to the caller
    fi
  else
    log WARN "Source not found: $source"
  fi
  rm -f "$exclude_file"
  return "$rc"
}
# Check that a backup archive is readable and intact, picking the test by
# file extension. $1 = archive path. Returns 0 on success, 1 on failure or
# unrecognised extension. Note the *.tar.gz / *.tar.bz2 patterns are tested
# before the bare *.tar pattern, preserving the original match precedence.
verify_backup() {
  local archive="$1"
  log INFO "Verifying backup: $archive"
  if [[ "$archive" == *.tar.gz ]]; then
    if gzip -t "$archive" && tar -tzf "$archive" >/dev/null; then
      log INFO "Backup verification successful: $archive"
      return 0
    fi
  elif [[ "$archive" == *.tar.bz2 ]]; then
    if bzip2 -t "$archive" && tar -tjf "$archive" >/dev/null; then
      log INFO "Backup verification successful: $archive"
      return 0
    fi
  elif [[ "$archive" == *.tar ]]; then
    if tar -tf "$archive" >/dev/null; then
      log INFO "Backup verification successful: $archive"
      return 0
    fi
  elif [[ "$archive" == *.gpg ]]; then
    # Only checks GPG packet structure; decryption is not attempted here.
    if gpg --list-packets "$archive" >/dev/null 2>&1; then
      log INFO "Encrypted backup verification successful: $archive"
      return 0
    fi
  fi
  log ERROR "Backup verification failed: $archive"
  return 1
}
# Clean old backups
# Delete timestamped ("20*") backup directories directly under $BACKUP_ROOT
# that are older than RETENTION_DAYS days, and log how many were removed.
cleanup_old_backups() {
  log INFO "Cleaning up backups older than $RETENTION_DAYS days"
  local deleted_count=0
  local old_backup
  # Feed the loop via process substitution: the original piped find into
  # `while read`, which ran the loop body in a subshell, so deleted_count
  # was always reported as 0 regardless of how many dirs were removed.
  while IFS= read -r old_backup; do
    log INFO "Removing old backup: $old_backup"
    rm -rf "$old_backup"
    deleted_count=$((deleted_count + 1))
  done < <(find "$BACKUP_ROOT" -maxdepth 1 -type d -name "20*" -mtime +"$RETENTION_DAYS")
  log INFO "Cleanup completed, removed $deleted_count old backups"
}
# Send notification
# Report the run's outcome by email (NOTIFICATION_EMAIL) and/or Slack
# (SLACK_WEBHOOK). Both channels are optional and silently skipped when
# unconfigured or when the mail/curl binaries are absent.
# $1 = status (SUCCESS/FAILED), $2 = one-line summary.
send_notification() {
local status="$1"
local summary="$2"
if [ -n "$NOTIFICATION_EMAIL" ] && command -v mail >/dev/null 2>&1; then
{
echo "Backup Status: $status"
echo "Time: $(date)"
echo "Summary: $summary"
echo
echo "Recent log entries:"
tail -20 "$LOG_FILE"
} | mail -s "Backup Report - $status" "$NOTIFICATION_EMAIL"
log INFO "Notification sent to $NOTIFICATION_EMAIL"
fi
if [ -n "$SLACK_WEBHOOK" ] && command -v curl >/dev/null 2>&1; then
# Slack attachment colour: green ("good") for success, red otherwise.
local color="good"
[ "$status" != "SUCCESS" ] && color="danger"
curl -X POST -H 'Content-type: application/json' \
--data "{\"attachments\":[{\"color\":\"$color\",\"title\":\"Backup Report\",\"text\":\"Status: $status\\nSummary: $summary\"}]}" \
"$SLACK_WEBHOOK" >/dev/null 2>&1
log INFO "Slack notification sent"
fi
}
# Generate backup report
# Write a human-readable summary (per-file sizes plus totals) into the
# current backup directory. The brace group below runs in the current shell
# (it is a group, not a pipeline), so the counters it updates are visible
# when the Summary section is printed at the end of the same group.
generate_report() {
local report_file="$BACKUP_DIR/backup_report.txt"
local total_size=0
local file_count=0
{
echo "Backup Report"
echo "============="
echo "Date: $(date)"
echo "Backup Directory: $BACKUP_DIR"
echo
echo "Backed up files:"
echo "==============="
for file in "$BACKUP_DIR"/*; do
if [ -f "$file" ] && [[ "$(basename "$file")" != "backup_report.txt" ]]; then
# stat -c%s is GNU; stat -f%z is the BSD/macOS fallback.
local size=$(stat -c%s "$file" 2>/dev/null || stat -f%z "$file" 2>/dev/null)
# NOTE(review): numfmt is GNU coreutils — not available on stock BSD/macOS.
local human_size=$(numfmt --to=iec-i --suffix=B "$size")
echo "$(basename "$file"): $human_size"
total_size=$((total_size + size))
((file_count++))
fi
done
echo
echo "Summary:"
echo "========"
echo "Total files: $file_count"
echo "Total size: $(numfmt --to=iec-i --suffix=B $total_size)"
echo "Compression: $COMPRESSION"
echo "Encryption: $ENCRYPTION"
} > "$report_file"
log INFO "Backup report generated: $report_file"
}
# Main backup process
# Orchestrates a full run: load config, create the timestamped backup dir,
# archive each source and configured database, verify every artifact,
# write the report, prune old backups, and notify. Exits 1 on any failure.
main() {
log INFO "Starting automated backup process"
load_config
setup_backup_dirs
local backup_success=true
local backed_up_items=0
# Backup files and directories
for source in "${BACKUP_SOURCES[@]}"; do
if backup_files "$source"; then
((backed_up_items++))
else
backup_success=false
fi
done
# Backup databases if configured
# DATABASES is a comma-separated list of type:name:user[:host] entries.
if [ -n "$DATABASES" ]; then
IFS=',' read -ra DB_LIST <<< "$DATABASES"
for db_config in "${DB_LIST[@]}"; do
IFS=':' read -ra DB_PARTS <<< "$db_config"
if [ ${#DB_PARTS[@]} -ge 3 ]; then
if backup_database "${DB_PARTS[0]}" "${DB_PARTS[1]}" "${DB_PARTS[2]}" "${DB_PARTS[3]:-localhost}"; then
((backed_up_items++))
else
backup_success=false
fi
fi
done
fi
# Verify backups
# Every artifact in the run directory is checked except the report file.
for backup_file in "$BACKUP_DIR"/*; do
if [ -f "$backup_file" ] && [[ "$(basename "$backup_file")" != "backup_report.txt" ]]; then
if ! verify_backup "$backup_file"; then
backup_success=false
fi
fi
done
# Generate report
generate_report
# Cleanup old backups
cleanup_old_backups
# Send notification
if [ "$backup_success" = true ]; then
send_notification "SUCCESS" "Backup completed successfully ($backed_up_items items)"
log INFO "Backup process completed successfully"
else
send_notification "FAILED" "Backup completed with errors"
log ERROR "Backup process completed with errors"
exit 1
fi
}
# Handle command line arguments
# Dispatch on the first CLI argument (default: backup).
case "${1:-backup}" in
backup)
# Run the full backup pipeline.
main
;;
config)
# Write a commented configuration template to $BACKUP_CONFIG. The unquoted
# EOF delimiter means $(date) expands at generation time; the rest are
# literal assignments read back later by load_config().
cat > "$BACKUP_CONFIG" << EOF
# Backup Configuration File
# Generated on $(date)
# Backup sources (space-separated)
BACKUP_SOURCES=(
"/home"
"/etc"
"/var/www"
"/opt"
)
# Database backups (format: type:name:user:host)
# DATABASES="mysql:myapp:dbuser:localhost,postgresql:webapp:pguser:localhost"
# Backup settings
RETENTION_DAYS=30
COMPRESSION="gzip" # gzip, bzip2, or none
ENCRYPTION="false"
GPG_RECIPIENT=""
# Notifications
NOTIFICATION_EMAIL=""
SLACK_WEBHOOK=""
# Database credentials
DB_PASSWORD=""
EOF
echo "Configuration template created: $BACKUP_CONFIG"
;;
restore)
# Placeholder: restore is not implemented yet.
echo "Restore functionality not implemented yet"
;;
*)
echo "Usage: $0 {backup|config|restore}"
echo "Environment variables:"
echo " BACKUP_CONFIG - Configuration file path"
echo " BACKUP_ROOT - Root backup directory"
echo " LOG_FILE - Log file path"
exit 1
;;
esac
System Health Monitor
#!/bin/bash
# health_monitor.sh - Comprehensive system health monitoring
# Paths and interval; all overridable via environment or $MONITOR_CONFIG.
MONITOR_CONFIG="${MONITOR_CONFIG:-/etc/health_monitor.conf}"
LOG_FILE="${LOG_FILE:-/var/log/health_monitor.log}"
ALERT_LOG="${ALERT_LOG:-/var/log/health_alerts.log}"
CHECK_INTERVAL="${CHECK_INTERVAL:-300}" # 5 minutes
# Thresholds
# Percentage thresholds for CPU/memory/disk/inodes; LOAD_THRESHOLD is
# load average per CPU core.
CPU_THRESHOLD="${CPU_THRESHOLD:-80}"
MEMORY_THRESHOLD="${MEMORY_THRESHOLD:-85}"
DISK_THRESHOLD="${DISK_THRESHOLD:-90}"
LOAD_THRESHOLD="${LOAD_THRESHOLD:-2.0}"
INODE_THRESHOLD="${INODE_THRESHOLD:-90}"
# Alert settings
# ALERT_COOLDOWN suppresses repeated alerts with the same key (seconds).
ALERT_EMAIL="${ALERT_EMAIL:-}"
ALERT_WEBHOOK="${ALERT_WEBHOOK:-}"
ALERT_COOLDOWN="${ALERT_COOLDOWN:-3600}" # 1 hour
# Services to monitor
CRITICAL_SERVICES=("ssh" "nginx" "apache2" "mysql" "postgresql")
# Emit a timestamped "[LEVEL] message" line to stdout and append it to
# $LOG_FILE. $1 = level, remaining args = message text.
log() {
  local lvl="$1"; shift
  printf '%s [%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$lvl" "$*" | tee -a "$LOG_FILE"
}
# Alert function
# Record an alert in $ALERT_LOG and fan out notifications, rate-limited
# per alert key so the same condition cannot alert more often than once
# per ALERT_COOLDOWN seconds.
# $1 = severity, $2 = message, $3 = cooldown key (unique per alert type).
alert() {
local severity="$1"
local message="$2"
local alert_key="$3"
# Check cooldown
# NOTE(review): the cooldown stamp lives at a predictable /tmp path — any
# local user could pre-create it to suppress alerts; consider a private
# state directory instead.
local last_alert_file="/tmp/alert_${alert_key}.last"
if [ -f "$last_alert_file" ]; then
local last_alert=$(cat "$last_alert_file")
local current_time=$(date +%s)
local time_diff=$((current_time - last_alert))
if [ $time_diff -lt $ALERT_COOLDOWN ]; then
log DEBUG "Alert suppressed due to cooldown: $alert_key"
return
fi
fi
# Log alert
echo "$(date '+%Y-%m-%d %H:%M:%S') [$severity] $message" >> "$ALERT_LOG"
log WARN "ALERT [$severity]: $message"
# Send notifications
send_alert_notification "$severity" "$message"
# Update cooldown
date +%s > "$last_alert_file"
}
# Send alert notifications
# Deliver one alert by email (ALERT_EMAIL) and/or webhook (ALERT_WEBHOOK).
# Each channel is skipped when unconfigured or its binary is missing.
# The email body includes a system snapshot (uname/uptime/free/df).
# $1 = severity, $2 = message.
send_alert_notification() {
local severity="$1"
local message="$2"
# Email notification
if [ -n "$ALERT_EMAIL" ] && command -v mail >/dev/null 2>&1; then
{
echo "System Health Alert"
echo "=================="
echo "Severity: $severity"
echo "Time: $(date)"
echo "Host: $(hostname)"
echo
echo "Message: $message"
echo
echo "System Information:"
echo "=================="
uname -a
echo
echo "Load Average:"
uptime
echo
echo "Memory Usage:"
free -h
echo
echo "Disk Usage:"
df -h
} | mail -s "[$severity] System Health Alert - $(hostname)" "$ALERT_EMAIL"
fi
# Webhook notification
# Payload uses Slack-style attachments; colour is red for CRITICAL.
if [ -n "$ALERT_WEBHOOK" ] && command -v curl >/dev/null 2>&1; then
local color="warning"
[ "$severity" = "CRITICAL" ] && color="danger"
curl -X POST -H 'Content-type: application/json' \
--data "{\"attachments\":[{\"color\":\"$color\",\"title\":\"System Health Alert\",\"fields\":[{\"title\":\"Severity\",\"value\":\"$severity\",\"short\":true},{\"title\":\"Host\",\"value\":\"$(hostname)\",\"short\":true},{\"title\":\"Message\",\"value\":\"$message\",\"short\":false}]}]}" \
"$ALERT_WEBHOOK" >/dev/null 2>&1
fi
}
# Check CPU usage
# Sample current CPU utilisation via `top` and emit "cpu_usage:<int>".
# Alerts (key "cpu_high") above CPU_THRESHOLD and logs the five busiest
# processes when it does.
check_cpu() {
  local cpu_usage
  cpu_usage=$(top -bn1 | grep "Cpu(s)" | awk '{print $2}' | cut -d'%' -f1)
  cpu_usage=${cpu_usage%.*}
  # top's line format varies across procps versions and locales; fall back to
  # 0 instead of crashing `[ -gt ]` with a non-numeric operand.
  case "$cpu_usage" in
    ''|*[!0-9]*)
      log WARN "Could not parse CPU usage from top output"
      cpu_usage=0
      ;;
  esac
  log INFO "CPU usage: ${cpu_usage}%"
  if [ "$cpu_usage" -gt "$CPU_THRESHOLD" ]; then
    alert "WARNING" "High CPU usage: ${cpu_usage}% (threshold: ${CPU_THRESHOLD}%)" "cpu_high"
    # Get top CPU processes
    local top_processes
    top_processes=$(ps aux --sort=-%cpu | head -6 | tail -5 | awk '{print $11 " (" $3 "%)"}' | tr '\n' ', ')
    log INFO "Top CPU processes: $top_processes"
  fi
  echo "cpu_usage:$cpu_usage"
}
# Report memory utilisation as "memory_usage:<pct>" and
# "memory_available:<kB>", alerting (key "memory_high") above
# MEMORY_THRESHOLD with the five biggest memory consumers logged.
check_memory() {
  local mem_line total used available usage_pct
  mem_line=$(free | grep Mem)
  # free's Mem row: label total used free shared buff/cache available
  read -r _ total used _ _ _ available <<< "$mem_line"
  usage_pct=$((used * 100 / total))
  log INFO "Memory usage: ${usage_pct}% (${used}/${total})"
  if [ "$usage_pct" -gt "$MEMORY_THRESHOLD" ]; then
    alert "WARNING" "High memory usage: ${usage_pct}% (threshold: ${MEMORY_THRESHOLD}%)" "memory_high"
    local top_processes
    top_processes=$(ps aux --sort=-%mem | head -6 | tail -5 | awk '{print $11 " (" $4 "%)"}' | tr '\n' ', ')
    log INFO "Top memory processes: $top_processes"
  fi
  echo "memory_usage:$usage_pct"
  echo "memory_available:$available"
}
# Check disk usage
# Report space and inode usage per mounted filesystem (tmpfs/cdrom excluded),
# alerting above DISK_THRESHOLD / INODE_THRESHOLD with per-mount alert keys.
check_disk() {
local alerts_sent=false
# NOTE(review): both loops read from a pipeline and therefore run in
# subshells — alerts and echoed metric lines still work (they write to
# files/stdout), but the alerts_sent flag set below is lost when the loop
# ends (it is also never read afterwards).
df -h | grep -vE '^Filesystem|tmpfs|cdrom' | while read output; do
local usage=$(echo $output | awk '{print $5}' | cut -d'%' -f1)
local partition=$(echo $output | awk '{print $1}')
local mount_point=$(echo $output | awk '{print $6}')
local size=$(echo $output | awk '{print $2}')
local available=$(echo $output | awk '{print $4}')
log INFO "Disk usage $mount_point: ${usage}% ($available available)"
if [ "$usage" -gt "$DISK_THRESHOLD" ]; then
# Mount-point slashes become underscores so each mount gets its own key.
alert "WARNING" "High disk usage on $mount_point: ${usage}% (threshold: ${DISK_THRESHOLD}%)" "disk_${mount_point//\//_}"
alerts_sent=true
fi
echo "disk_usage_${mount_point//\//_}:$usage"
done
# Check inode usage
df -i | grep -vE '^Filesystem|tmpfs|cdrom' | while read output; do
local inode_usage=$(echo $output | awk '{print $5}' | cut -d'%' -f1)
local mount_point=$(echo $output | awk '{print $6}')
if [ "$inode_usage" -gt "$INODE_THRESHOLD" ]; then
alert "WARNING" "High inode usage on $mount_point: ${inode_usage}% (threshold: ${INODE_THRESHOLD}%)" "inode_${mount_point//\//_}"
fi
echo "inode_usage_${mount_point//\//_}:$inode_usage"
done
}
# Check system load
# Report the 1-minute load average and per-core load; alert (key
# "load_high") when the per-core load exceeds LOAD_THRESHOLD.
check_load() {
  local load_1min cpu_cores load_per_core high
  load_1min=$(uptime | awk -F'load average:' '{print $2}' | awk '{print $1}' | tr -d ',')
  cpu_cores=$(nproc)
  if command -v bc >/dev/null 2>&1; then
    load_per_core=$(echo "scale=2; $load_1min / $cpu_cores" | bc -l)
    high=$(echo "$load_per_core > $LOAD_THRESHOLD" | bc -l)
  else
    # bc is absent on many minimal installs; fall back to awk so the check
    # degrades gracefully instead of erroring out mid-run.
    load_per_core=$(awk -v l="$load_1min" -v c="$cpu_cores" 'BEGIN { printf "%.2f", l / c }')
    high=$(awk -v l="$load_per_core" -v t="$LOAD_THRESHOLD" 'BEGIN { print (l > t) ? 1 : 0 }')
  fi
  log INFO "Load average: $load_1min (${load_per_core} per core)"
  if [ "$high" -eq 1 ]; then
    alert "WARNING" "High system load: $load_1min (${load_per_core} per core, threshold: $LOAD_THRESHOLD)" "load_high"
  fi
  echo "load_1min:$load_1min"
  echo "load_per_core:$load_per_core"
}
# Check critical services
# For each service in CRITICAL_SERVICES emit "service_<name>:1|0|-1"
# (running / installed-but-stopped / not installed) via systemctl, and raise
# one CRITICAL alert listing every installed-but-stopped service.
check_services() {
local failed_services=()
for service in "${CRITICAL_SERVICES[@]}"; do
if systemctl is-active --quiet "$service" 2>/dev/null; then
log INFO "Service $service: running"
echo "service_${service}:1"
# Distinguish "stopped" from "not installed" by unit-file presence.
elif systemctl list-unit-files | grep -q "^$service.service"; then
log WARN "Service $service: stopped"
failed_services+=("$service")
echo "service_${service}:0"
else
log DEBUG "Service $service: not installed"
echo "service_${service}:-1"
fi
done
if [ ${#failed_services[@]} -gt 0 ]; then
alert "CRITICAL" "Critical services not running: ${failed_services[*]}" "services_down"
fi
}
# Probe outbound connectivity by pinging well-known public resolvers once
# each (5s timeout). Emits "network_<host>:1|0" per probe and raises one
# CRITICAL alert (key "network_down") listing every unreachable host.
check_network() {
  local probes=("8.8.8.8" "1.1.1.1")
  local unreachable=()
  local target
  for target in "${probes[@]}"; do
    if ping -c 1 -W 5 "$target" >/dev/null 2>&1; then
      log INFO "Network connectivity to $target: OK"
      echo "network_${target//./_}:1"
    else
      log WARN "Network connectivity to $target: FAILED"
      unreachable+=("$target")
      echo "network_${target//./_}:0"
    fi
  done
  if [ ${#unreachable[@]} -gt 0 ]; then
    alert "CRITICAL" "Network connectivity issues: ${unreachable[*]}" "network_down"
  fi
}
# Check SSL certificates
# Scan certificate directories for certs expiring within 30 days. Optional
# positional args override the default search paths (backward compatible:
# with no args it behaves as before). Emits "ssl_cert_<name>_days:<n>" per
# certificate and alerts (key "ssl_expiring") when any expires in <30 days.
check_ssl_certificates() {
  local cert_paths=("$@")
  [ ${#cert_paths[@]} -eq 0 ] && cert_paths=("/etc/ssl/certs" "/etc/letsencrypt/live")
  local expiring_certs=()
  local cert_path cert_file
  for cert_path in "${cert_paths[@]}"; do
    [ -d "$cert_path" ] || continue
    # Process substitution keeps this loop in the current shell: the original
    # piped find into `while`, so expiring_certs was appended to inside a
    # subshell, stayed empty here, and the expiry alert below NEVER fired.
    while IFS= read -r cert_file; do
      [ -f "$cert_file" ] || continue
      local expiry_date expiry_timestamp current_timestamp days_until_expiry
      expiry_date=$(openssl x509 -in "$cert_file" -noout -enddate 2>/dev/null | cut -d= -f2)
      [ -n "$expiry_date" ] || continue
      # date -d is GNU; parse failures are silenced as in the original.
      expiry_timestamp=$(date -d "$expiry_date" +%s 2>/dev/null)
      current_timestamp=$(date +%s)
      days_until_expiry=$(( (expiry_timestamp - current_timestamp) / 86400 ))
      log INFO "SSL certificate $cert_file expires in $days_until_expiry days"
      if [ "$days_until_expiry" -lt 30 ]; then
        expiring_certs+=("$(basename "$cert_file"):$days_until_expiry")
      fi
      echo "ssl_cert_$(basename "$cert_file" .crt | tr '.' '_')_days:$days_until_expiry"
    done < <(find "$cert_path" \( -name "*.crt" -o -name "cert.pem" \))
  done
  if [ ${#expiring_certs[@]} -gt 0 ]; then
    alert "WARNING" "SSL certificates expiring soon: ${expiring_certs[*]}" "ssl_expiring"
  fi
}
# Check log files for errors
# Count error/critical/fatal lines in common syslog files and alert above
# error_threshold.
# NOTE(review): `find -newermt` only tests the file's mtime — when a log was
# modified within the window, grep still scans the WHOLE file, so the count
# is "errors in recently-modified logs", not strictly "errors in the last
# hour". Confirm whether that is the intended semantics.
check_log_errors() {
local log_files=("/var/log/syslog" "/var/log/auth.log" "/var/log/kern.log")
local error_threshold=50
local time_window="1 hour ago"
for log_file in "${log_files[@]}"; do
if [ -f "$log_file" ]; then
local error_count=$(find "$log_file" -newermt "$time_window" -exec grep -i "error\|critical\|fatal" {} \; 2>/dev/null | wc -l)
log INFO "Errors in $(basename "$log_file"): $error_count (last hour)"
if [ "$error_count" -gt "$error_threshold" ]; then
alert "WARNING" "High error count in $log_file: $error_count errors in last hour" "log_errors_$(basename "$log_file")"
fi
echo "log_errors_$(basename "$log_file"):$error_count"
fi
done
}
# Generate metrics for monitoring systems
# Run every health check once and emit the collected output. With
# OUTPUT_FORMAT=prometheus, metric-shaped lines ("name:value") become
# "system_name value"; any other mode dumps the raw capture.
generate_metrics() {
  local metrics_file
  # mktemp instead of the predictable /tmp/health_metrics.$$ (guessable
  # name, symlink-attack risk when running as root).
  metrics_file=$(mktemp) || return 1
  {
    check_cpu
    check_memory
    check_disk
    check_load
    check_services
    check_network
    check_ssl_certificates
    check_log_errors
  } > "$metrics_file"
  if [ "$OUTPUT_FORMAT" = "prometheus" ]; then
    # log() tees to stdout, so the capture can contain log lines as well as
    # metrics; keep only "name:value"-shaped lines for Prometheus export
    # (the original mangled timestamped log lines into bogus metrics).
    grep -E '^[A-Za-z_][A-Za-z0-9_.-]*:[^:]*$' "$metrics_file" | while IFS=':' read -r metric value; do
      echo "system_${metric} ${value}"
    done
  else
    cat "$metrics_file"
  fi
  rm -f "$metrics_file"
}
# Main monitoring loop
# Run the full check suite every CHECK_INTERVAL seconds, forever. The latest
# snapshot is written to /tmp/current_metrics.txt for external readers.
monitor_loop() {
log INFO "Starting health monitoring loop (interval: ${CHECK_INTERVAL}s)"
while true; do
log INFO "Running health checks..."
generate_metrics > "/tmp/current_metrics.txt"
log INFO "Health check completed"
sleep "$CHECK_INTERVAL"
done
}
# One-time health check
# Run a single round of all checks and print the metrics to stdout.
health_check() {
log INFO "Running one-time health check"
generate_metrics
}
# Show system status dashboard
# Clear the terminal and print a one-shot overview: system info, the current
# metrics (one full check round), and the last five alerts.
show_dashboard() {
clear
echo "System Health Dashboard - $(hostname)"
echo "====================================="
echo "Last updated: $(date)"
echo
# System info
echo "System Information:"
echo "=================="
echo "Uptime: $(uptime -p)"
echo "Kernel: $(uname -r)"
echo "Architecture: $(uname -m)"
echo
# Current metrics
# NOTE(review): generate_metrics output also contains log lines (log() tees
# to stdout), so some dashboard rows may not be metric name/value pairs.
echo "Current Metrics:"
echo "==============="
generate_metrics | while IFS=':' read -r metric value; do
printf "%-30s: %s\n" "$metric" "$value"
done
echo
echo "Recent Alerts:"
echo "============="
if [ -f "$ALERT_LOG" ]; then
tail -5 "$ALERT_LOG"
else
echo "No recent alerts"
fi
}
# Source $MONITOR_CONFIG when it exists, allowing it to override the
# defaults declared at the top of the script; silently use defaults when
# the file is absent.
load_config() {
  if [ -f "$MONITOR_CONFIG" ]; then
    # shellcheck disable=SC1090 — path is user-configurable
    . "$MONITOR_CONFIG"
    log INFO "Configuration loaded from $MONITOR_CONFIG"
  fi
}
# Main execution
# Load the optional config file, then dispatch on the first CLI argument
# (default: check).
load_config
case "${1:-check}" in
check)
health_check
;;
monitor)
monitor_loop
;;
dashboard)
show_dashboard
;;
metrics)
# Per-command environment assignment: only this invocation of
# generate_metrics runs in Prometheus output mode.
OUTPUT_FORMAT="prometheus" generate_metrics
;;
config)
# Write a commented configuration template to $MONITOR_CONFIG. The unquoted
# EOF delimiter means $(date) expands at generation time.
cat > "$MONITOR_CONFIG" << EOF
# Health Monitor Configuration
# Generated on $(date)
# Check intervals (seconds)
CHECK_INTERVAL=300
# Thresholds
CPU_THRESHOLD=80
MEMORY_THRESHOLD=85
DISK_THRESHOLD=90
LOAD_THRESHOLD=2.0
INODE_THRESHOLD=90
# Alert settings
ALERT_EMAIL=""
ALERT_WEBHOOK=""
ALERT_COOLDOWN=3600
# Critical services to monitor
CRITICAL_SERVICES=("ssh" "nginx" "apache2" "mysql" "postgresql")
# Log files
LOG_FILE="/var/log/health_monitor.log"
ALERT_LOG="/var/log/health_alerts.log"
EOF
echo "Configuration template created: $MONITOR_CONFIG"
;;
*)
echo "Usage: $0 {check|monitor|dashboard|metrics|config}"
echo "Commands:"
echo " check - Run one-time health check"
echo " monitor - Start continuous monitoring"
echo " dashboard - Show system status dashboard"
echo " metrics - Output metrics in Prometheus format"
echo " config - Generate configuration template"
exit 1
;;
esac
```

## Development Automation

### CI/CD Pipeline Script
```bash
#!/bin/bash
# cicd_pipeline.sh - Continuous Integration/Continuous Deployment pipeline
# All settings overridable via the environment.
PROJECT_NAME="${PROJECT_NAME:-myapp}"
GIT_REPO="${GIT_REPO:-}"
BUILD_DIR="${BUILD_DIR:-./build}"
DEPLOY_DIR="${DEPLOY_DIR:-/var/www/$PROJECT_NAME}"
LOG_FILE="${LOG_FILE:-./pipeline.log}"
# Pipeline stages
# Executed in order by run_pipeline(); each maps to a stage_<name> function.
STAGES=("checkout" "test" "build" "deploy" "notify")
# Configuration
SLACK_WEBHOOK="${SLACK_WEBHOOK:-}"
EMAIL_NOTIFICATIONS="${EMAIL_NOTIFICATIONS:-}"
DOCKER_REGISTRY="${DOCKER_REGISTRY:-}"
KUBERNETES_NAMESPACE="${KUBERNETES_NAMESPACE:-default}"
# Colors for output
# ANSI escape codes; NC resets the colour.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'
# Logging function
# Append a plain timestamped line to $LOG_FILE and print a colour-coded line
# to the console. $1 = level (ERROR/WARN/INFO/DEBUG), rest = message.
log() {
  local level="$1"
  shift
  local message="$*"
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  # Append to the log file only: the original piped through `tee`, which
  # printed the plain line to stdout AND then echoed the coloured line below,
  # duplicating every message on the console.
  echo "[$timestamp] [$level] $message" >> "$LOG_FILE"
  case "$level" in
    ERROR) echo -e "${RED}[$timestamp] ERROR: $message${NC}" ;;
    WARN) echo -e "${YELLOW}[$timestamp] WARN: $message${NC}" ;;
    INFO) echo -e "${GREEN}[$timestamp] INFO: $message${NC}" ;;
    DEBUG) echo -e "${BLUE}[$timestamp] DEBUG: $message${NC}" ;;
    # Unknown levels still reach the console, uncoloured.
    *) echo "[$timestamp] [$level] $message" ;;
  esac
}
# Send notification
# Announce a pipeline result via Slack (SLACK_WEBHOOK) and/or email
# (EMAIL_NOTIFICATIONS); each channel is skipped when unconfigured or its
# binary is absent. $1 = status (SUCCESS/FAILED), $2 = stage, $3 = message.
send_notification() {
local status="$1"
local stage="$2"
local message="$3"
# Slack notification
if [ -n "$SLACK_WEBHOOK" ] && command -v curl >/dev/null 2>&1; then
# Attachment colour: green ("good") for success, red otherwise.
local color="good"
[ "$status" != "SUCCESS" ] && color="danger"
curl -X POST -H 'Content-type: application/json' \
--data "{\"attachments\":[{\"color\":\"$color\",\"title\":\"Pipeline $status\",\"fields\":[{\"title\":\"Project\",\"value\":\"$PROJECT_NAME\",\"short\":true},{\"title\":\"Stage\",\"value\":\"$stage\",\"short\":true},{\"title\":\"Message\",\"value\":\"$message\",\"short\":false}]}]}" \
"$SLACK_WEBHOOK" >/dev/null 2>&1
fi
# Email notification
if [ -n "$EMAIL_NOTIFICATIONS" ] && command -v mail >/dev/null 2>&1; then
{
echo "Pipeline Status: $status"
echo "Project: $PROJECT_NAME"
echo "Stage: $stage"
echo "Time: $(date)"
echo
echo "Message: $message"
echo
echo "Recent log entries:"
tail -20 "$LOG_FILE"
} | mail -s "Pipeline $status - $PROJECT_NAME" "$EMAIL_NOTIFICATIONS"
fi
}
# Checkout stage
# Clone $GIT_REPO into a freshly-cleaned $BUILD_DIR and log the HEAD commit.
# Returns 1 when GIT_REPO is unset or the clone fails.
stage_checkout() {
  log INFO "Starting checkout stage"
  if [ -z "$GIT_REPO" ]; then
    log ERROR "GIT_REPO not specified"
    return 1
  fi
  # Clean previous build
  rm -rf "$BUILD_DIR"
  mkdir -p "$BUILD_DIR"
  # Clone repository
  if git clone "$GIT_REPO" "$BUILD_DIR"; then
    log INFO "Repository cloned successfully"
    # Gather commit info inside command substitutions so the `cd` cannot leak
    # into the caller: the original changed the script's cwd here, which broke
    # every later stage's `cd "$BUILD_DIR"` when BUILD_DIR is relative
    # (the default is ./build).
    local commit_hash commit_message author
    commit_hash=$(cd "$BUILD_DIR" && git rev-parse HEAD)
    commit_message=$(cd "$BUILD_DIR" && git log -1 --pretty=%B)
    author=$(cd "$BUILD_DIR" && git log -1 --pretty=%an)
    log INFO "Commit: $commit_hash"
    log INFO "Author: $author"
    log INFO "Message: $commit_message"
    return 0
  else
    log ERROR "Failed to clone repository"
    return 1
  fi
}
# Test stage
# Detect the project type in $BUILD_DIR (Node.js / Python / Make) and run
# its dependency install + test commands. Returns 1 on any install/test
# failure; unknown project layouts are skipped with a warning.
# NOTE(review): `cd "$BUILD_DIR"` is unchecked and leaks into the caller —
# with a relative BUILD_DIR this depends on the cwd left by earlier stages.
stage_test() {
log INFO "Starting test stage"
cd "$BUILD_DIR"
# Detect project type and run appropriate tests
if [ -f "package.json" ]; then
# Node.js project
log INFO "Detected Node.js project"
if command -v npm >/dev/null 2>&1; then
log INFO "Installing dependencies..."
if npm install; then
log INFO "Dependencies installed"
else
log ERROR "Failed to install dependencies"
return 1
fi
log INFO "Running tests..."
if npm test; then
log INFO "Tests passed"
else
log ERROR "Tests failed"
return 1
fi
else
log ERROR "npm not found"
return 1
fi
elif [ -f "requirements.txt" ] || [ -f "setup.py" ]; then
# Python project
log INFO "Detected Python project"
if command -v python3 >/dev/null 2>&1; then
# Create virtual environment
python3 -m venv venv
# NOTE(review): activating the venv mutates the current shell's PATH and
# persists after this function returns — later stages inherit it.
source venv/bin/activate
# Install dependencies
if [ -f "requirements.txt" ]; then
log INFO "Installing Python dependencies..."
if pip install -r requirements.txt; then
log INFO "Dependencies installed"
else
log ERROR "Failed to install dependencies"
return 1
fi
fi
# Run tests
if [ -f "pytest.ini" ] || [ -d "tests" ]; then
log INFO "Running Python tests..."
if python -m pytest; then
log INFO "Tests passed"
else
log ERROR "Tests failed"
return 1
fi
fi
else
log ERROR "python3 not found"
return 1
fi
elif [ -f "Makefile" ]; then
# Make-based project
log INFO "Detected Make-based project"
if make test; then
log INFO "Tests passed"
else
log ERROR "Tests failed"
return 1
fi
else
log WARN "No recognized test framework found, skipping tests"
fi
return 0
}
# Build stage
# Build the artifact in $BUILD_DIR according to project type: Docker image
# (optionally pushed to DOCKER_REGISTRY), npm build, Make, or a plain file
# copy fallback. Returns 1 on Docker/Make failures; npm build failures only
# warn.
stage_build() {
log INFO "Starting build stage"
cd "$BUILD_DIR"
# Detect project type and build
if [ -f "Dockerfile" ]; then
# Docker build
log INFO "Building Docker image"
# Image tag is <project>:<short-commit-hash>.
local image_tag="$PROJECT_NAME:$(git rev-parse --short HEAD)"
if docker build -t "$image_tag" .; then
log INFO "Docker image built: $image_tag"
# Push to registry if configured
if [ -n "$DOCKER_REGISTRY" ]; then
local registry_image="$DOCKER_REGISTRY/$image_tag"
if docker tag "$image_tag" "$registry_image"; then
log INFO "Tagged image for registry: $registry_image"
if docker push "$registry_image"; then
log INFO "Image pushed to registry"
else
log ERROR "Failed to push image to registry"
return 1
fi
fi
fi
else
log ERROR "Docker build failed"
return 1
fi
elif [ -f "package.json" ]; then
# Node.js build
log INFO "Building Node.js application"
# Try `npm run build`, then `npm run compile`; missing scripts only warn.
if npm run build 2>/dev/null || npm run compile 2>/dev/null; then
log INFO "Build completed"
else
log WARN "No build script found or build failed"
fi
elif [ -f "Makefile" ]; then
# Make build
log INFO "Building with Make"
if make; then
log INFO "Build completed"
else
log ERROR "Build failed"
return 1
fi
else
log INFO "No build process detected, copying files"
# Simple file copy
# NOTE(review): `cp -r . dist/` includes dist itself in the source — GNU cp
# refuses to copy a directory into itself; verify this fallback on target
# platforms.
mkdir -p dist
cp -r . dist/
log INFO "Files copied to dist/"
fi
return 0
}
# Deploy stage
# Snapshot the current deployment, then deploy either to Kubernetes (when a
# Dockerfile and kubectl are present) or by copying files into $DEPLOY_DIR
# and reloading nginx/apache. Returns 1 on Kubernetes failures.
stage_deploy() {
log INFO "Starting deploy stage"
cd "$BUILD_DIR"
# Backup current deployment
# These timestamped snapshots are what rollback() restores from.
if [ -d "$DEPLOY_DIR" ]; then
local backup_dir="${DEPLOY_DIR}.backup.$(date +%Y%m%d_%H%M%S)"
log INFO "Creating backup: $backup_dir"
cp -r "$DEPLOY_DIR" "$backup_dir"
fi
# Deploy based on project type
if [ -f "Dockerfile" ] && command -v kubectl >/dev/null 2>&1; then
# Kubernetes deployment
log INFO "Deploying to Kubernetes"
local image_tag="$PROJECT_NAME:$(git rev-parse --short HEAD)"
# Update deployment
# NOTE(review): the literal container name "container" must match the name
# in the Deployment manifest — confirm against the cluster spec.
if kubectl set image deployment/$PROJECT_NAME container=$DOCKER_REGISTRY/$image_tag -n $KUBERNETES_NAMESPACE; then
log INFO "Kubernetes deployment updated"
# Wait for rollout
if kubectl rollout status deployment/$PROJECT_NAME -n $KUBERNETES_NAMESPACE; then
log INFO "Deployment rollout completed"
else
log ERROR "Deployment rollout failed"
return 1
fi
else
log ERROR "Failed to update Kubernetes deployment"
return 1
fi
else
# File-based deployment
log INFO "Deploying files to $DEPLOY_DIR"
# Create deploy directory
mkdir -p "$DEPLOY_DIR"
# Copy files
if [ -d "dist" ]; then
cp -r dist/* "$DEPLOY_DIR/"
else
cp -r . "$DEPLOY_DIR/"
fi
# Set permissions
# chown is best-effort (non-root runs fail silently by design here).
chown -R www-data:www-data "$DEPLOY_DIR" 2>/dev/null || true
# NOTE(review): 755 on every file marks all of them executable — consider
# 644 for regular files.
chmod -R 755 "$DEPLOY_DIR"
# Restart services if needed
if systemctl is-active --quiet nginx; then
systemctl reload nginx
log INFO "Nginx reloaded"
fi
if systemctl is-active --quiet apache2; then
systemctl reload apache2
log INFO "Apache reloaded"
fi
log INFO "Deployment completed"
fi
return 0
}
# Notify stage
# Announce a successful deployment, including the deployed commit's short
# hash and the first line of its message. Always returns 0.
stage_notify() {
  log INFO "Starting notify stage"
  local short_hash first_line
  short_hash=$(cd "$BUILD_DIR" && git rev-parse --short HEAD)
  first_line=$(cd "$BUILD_DIR" && git log -1 --pretty=%B | head -1)
  send_notification "SUCCESS" "deploy" "Deployment completed successfully. Commit: $short_hash - $first_line"
  return 0
}
# Run pipeline
# Execute every stage listed in STAGES in order, timing each one. On the
# first stage failure a FAILED notification is sent and the script exits 1.
run_pipeline() {
  local pipeline_start
  pipeline_start=$(date +%s)
  log INFO "Starting CI/CD pipeline for $PROJECT_NAME"
  local stage
  for stage in "${STAGES[@]}"; do
    log INFO "Running stage: $stage"
    local t0
    t0=$(date +%s)
    if ! "stage_$stage"; then
      local failed_after=$(( $(date +%s) - t0 ))
      log ERROR "Stage $stage failed after ${failed_after}s"
      send_notification "FAILED" "$stage" "Pipeline failed at stage: $stage"
      exit 1
    fi
    local took=$(( $(date +%s) - t0 ))
    log INFO "Stage $stage completed in ${took}s"
  done
  local total_duration=$(( $(date +%s) - pipeline_start ))
  log INFO "Pipeline completed successfully in ${total_duration}s"
}
# Rollback function
# Restore the deployment from a backup snapshot.
# $1 = backup directory (optional; defaults to the most recent
# ${DEPLOY_DIR}.backup.* snapshot). Exits 1 when no backup exists or the
# restore copy fails.
rollback() {
  local backup_dir="$1"
  if [ -z "$backup_dir" ]; then
    # Find latest backup (timestamped suffixes sort chronologically).
    backup_dir=$(ls -td "${DEPLOY_DIR}.backup."* 2>/dev/null | head -1)
  fi
  if [ -d "$backup_dir" ]; then
    log INFO "Rolling back to: $backup_dir"
    # ${DEPLOY_DIR:?} aborts instead of expanding to "" — guards the rm -rf
    # against an unset/empty DEPLOY_DIR (the original would happily run
    # `rm -rf ""` and then restore to the wrong place).
    rm -rf "${DEPLOY_DIR:?}"
    if cp -r "$backup_dir" "$DEPLOY_DIR"; then
      log INFO "Rollback completed"
    else
      log ERROR "Failed to restore from backup: $backup_dir"
      exit 1
    fi
  else
    log ERROR "Backup directory not found: $backup_dir"
    exit 1
  fi
}
# Show pipeline status
# Print the pipeline's configuration, the last 20 log lines, and any
# deployment backup snapshots available for rollback.
show_status() {
echo "CI/CD Pipeline Status"
echo "===================="
echo "Project: $PROJECT_NAME"
echo "Build Directory: $BUILD_DIR"
echo "Deploy Directory: $DEPLOY_DIR"
echo
if [ -f "$LOG_FILE" ]; then
echo "Recent Activity:"
echo "==============="
tail -20 "$LOG_FILE"
fi
echo
echo "Available Backups:"
echo "=================="
ls -la "${DEPLOY_DIR}.backup."* 2>/dev/null || echo "No backups found"
}
# Main execution
# Dispatch on the first CLI argument (default: run). Individual stages can
# also be invoked directly for debugging.
case "${1:-run}" in
run)
run_pipeline
;;
checkout)
stage_checkout
;;
test)
stage_test
;;
build)
stage_build
;;
deploy)
stage_deploy
;;
rollback)
# $2 optionally names a specific backup snapshot to restore.
rollback "$2"
;;
status)
show_status
;;
*)
echo "Usage: $0 {run|checkout|test|build|deploy|rollback|status}"
echo "Environment variables:"
echo " PROJECT_NAME - Project name"
echo " GIT_REPO - Git repository URL"
echo " BUILD_DIR - Build directory"
echo " DEPLOY_DIR - Deployment directory"
echo " SLACK_WEBHOOK - Slack webhook URL"
echo " EMAIL_NOTIFICATIONS - Email for notifications"
echo " DOCKER_REGISTRY - Docker registry URL"
echo " KUBERNETES_NAMESPACE - Kubernetes namespace"
exit 1
;;
esac
Environment Provisioning Script
#!/bin/bash
# provision_environment.sh - Automated environment provisioning
# The first CLI argument selects the target environment (default: development).
ENVIRONMENT="${1:-development}"
CONFIG_DIR="${CONFIG_DIR:-./config}"
ANSIBLE_PLAYBOOK="${ANSIBLE_PLAYBOOK:-}"
TERRAFORM_DIR="${TERRAFORM_DIR:-./terraform}"
# Environment configurations
# Maps long environment names to short codes.
# NOTE(review): ENV_CONFIGS is not referenced anywhere visible in this
# script — confirm whether it is still needed.
declare -A ENV_CONFIGS=(
["development"]="dev"
["staging"]="stage"
["production"]="prod"
)
# Required tools
# Checked by check_prerequisites() before provisioning starts.
REQUIRED_TOOLS=("ansible" "terraform" "kubectl" "docker")
# Print a timestamped message to stdout.
log() {
  printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}
# Verify every tool listed in REQUIRED_TOOLS is on PATH; exit 1 listing any
# that are missing.
check_prerequisites() {
  log "Checking prerequisites..."
  local absent=()
  local tool
  for tool in "${REQUIRED_TOOLS[@]}"; do
    command -v "$tool" >/dev/null 2>&1 || absent+=("$tool")
  done
  if [ ${#absent[@]} -gt 0 ]; then
    log "ERROR: Missing required tools: ${absent[*]}"
    log "Please install the missing tools and try again."
    exit 1
  fi
  log "All prerequisites satisfied"
}
# Source $CONFIG_DIR/<env>.conf when present; otherwise warn and fall back
# to the defaults. $1 = environment name.
load_environment_config() {
  local env="$1"
  local config_file="$CONFIG_DIR/${env}.conf"
  if [ ! -f "$config_file" ]; then
    log "WARNING: Configuration file not found: $config_file"
    log "Using default configuration"
    return
  fi
  # shellcheck disable=SC1090 — path is environment-dependent
  . "$config_file"
  log "Loaded configuration for environment: $env"
}
# Provision infrastructure with Terraform
# $1 = environment name (used as the Terraform workspace and tfvars name).
# The body runs in a subshell (note the `( )` function body) so the cd into
# $TERRAFORM_DIR can never leak into the caller — the original left the
# script stranded inside $TERRAFORM_DIR on every error return. Returns
# non-zero on any Terraform failure.
provision_infrastructure() (
  local env="$1"
  if [ ! -d "$TERRAFORM_DIR" ]; then
    log "Terraform directory not found: $TERRAFORM_DIR"
    exit 1
  fi
  log "Provisioning infrastructure for environment: $env"
  # Resolve CONFIG_DIR to an absolute path first: it defaults to ./config,
  # which would no longer resolve after the cd below (the original passed
  # relative tfvars/output paths to terraform from inside $TERRAFORM_DIR).
  local config_abs
  config_abs=$(cd "$CONFIG_DIR" 2>/dev/null && pwd) || config_abs="$CONFIG_DIR"
  cd "$TERRAFORM_DIR" || { log "ERROR: cannot cd to $TERRAFORM_DIR"; exit 1; }
  # Initialize Terraform
  if terraform init; then
    log "Terraform initialized"
  else
    log "ERROR: Terraform initialization failed"
    exit 1
  fi
  # Select (or create) the per-environment workspace
  if terraform workspace select "$env" 2>/dev/null || terraform workspace new "$env"; then
    log "Terraform workspace: $env"
  else
    log "ERROR: Failed to select/create workspace: $env"
    exit 1
  fi
  # Plan infrastructure changes
  log "Planning infrastructure changes..."
  if terraform plan -var-file="$config_abs/${env}.tfvars" -out="$env.tfplan"; then
    log "Terraform plan completed"
  else
    log "ERROR: Terraform planning failed"
    exit 1
  fi
  # Apply infrastructure changes
  log "Applying infrastructure changes..."
  if terraform apply "$env.tfplan"; then
    log "Infrastructure provisioned successfully"
    # Save outputs for later stages (health checks, app deploys)
    terraform output -json > "$config_abs/${env}_outputs.json"
    log "Terraform outputs saved"
  else
    log "ERROR: Terraform apply failed"
    exit 1
  fi
)
# Run the Ansible playbook (ANSIBLE_PLAYBOOK, default site.yml) against the
# environment's inventory. $1 = environment name. Returns 1 when the
# playbook file is missing or the run fails.
configure_servers() {
  local env="$1"
  local playbook="${ANSIBLE_PLAYBOOK:-site.yml}"
  if [ ! -f "$playbook" ]; then
    log "Ansible playbook not found: $playbook"
    return 1
  fi
  log "Configuring servers for environment: $env"
  if ! ansible-playbook -i "inventory/${env}" "$playbook" --extra-vars "environment=$env"; then
    log "ERROR: Ansible playbook execution failed"
    return 1
  fi
  log "Server configuration completed"
}
# Deploy applications
# Apply per-environment Kubernetes manifests (when KUBERNETES_NAMESPACE and
# kubectl are available) and/or bring up a per-environment Docker Compose
# stack. $1 = environment name. Both paths are optional and skipped when
# their configuration is absent.
deploy_applications() {
local env="$1"
log "Deploying applications for environment: $env"
# Deploy to Kubernetes if configured
if [ -n "$KUBERNETES_NAMESPACE" ] && command -v kubectl >/dev/null 2>&1; then
log "Deploying to Kubernetes namespace: $KUBERNETES_NAMESPACE"
# Apply Kubernetes manifests
if [ -d "k8s/$env" ]; then
kubectl apply -f "k8s/$env/" -n "$KUBERNETES_NAMESPACE"
log "Kubernetes manifests applied"
fi
# Wait for deployments to be ready (up to 5 minutes)
kubectl wait --for=condition=available --timeout=300s deployment --all -n "$KUBERNETES_NAMESPACE"
log "All deployments are ready"
fi
# Deploy with Docker Compose if configured
if [ -f "docker-compose.${env}.yml" ]; then
log "Deploying with Docker Compose"
docker-compose -f "docker-compose.${env}.yml" up -d
log "Docker Compose deployment completed"
fi
}
# Setup monitoring and logging
setup_monitoring() {
    # Best-effort deployment of per-environment monitoring and logging
    # manifests; kubectl failures are swallowed deliberately (`|| true`)
    # so a missing cluster never aborts provisioning.
    # Arguments: $1 - environment name.
    local target_env="$1"

    log "Setting up monitoring for environment: $target_env"

    # Monitoring stack manifests, if present for this environment.
    local monitoring_dir="monitoring/$target_env"
    if [ -d "$monitoring_dir" ]; then
        kubectl apply -f "$monitoring_dir/" -n "monitoring-$target_env" 2>/dev/null || true
        log "Monitoring stack deployed"
    fi

    # Fluentd-based log aggregation manifest, if present.
    local fluentd_manifest="logging/fluentd-${target_env}.yml"
    if [ -f "$fluentd_manifest" ]; then
        kubectl apply -f "$fluentd_manifest" -n "logging-$target_env" 2>/dev/null || true
        log "Log aggregation configured"
    fi
}
# Run health checks
run_health_checks() {
    # Probe each configured HTTP health endpoint for an environment.
    # Globals: CONFIG_DIR (read; <env>_health_checks.txt, one URL per line).
    # Arguments: $1 - environment name.
    # Returns: 0 when every endpoint responds OK, 1 otherwise.
    local env="$1"

    log "Running health checks for environment: $env"

    # Load health check endpoints from configuration
    local health_endpoints=()
    local endpoints_file="$CONFIG_DIR/${env}_health_checks.txt"
    if [ -f "$endpoints_file" ]; then
        while IFS= read -r endpoint; do
            # Skip blank lines so they are not probed (and reported failing).
            [ -n "$endpoint" ] && health_endpoints+=("$endpoint")
        done < "$endpoints_file"
    fi

    # Fall back to sensible local defaults when nothing is configured.
    if [ ${#health_endpoints[@]} -eq 0 ]; then
        health_endpoints=("http://localhost:8080/health" "http://localhost:3000/health")
    fi

    local failed_checks=0
    local endpoint
    for endpoint in "${health_endpoints[@]}"; do
        log "Checking health endpoint: $endpoint"
        # --max-time bounds each probe so one hung service cannot stall
        # the entire provisioning run.
        if curl -f -s --max-time 10 "$endpoint" >/dev/null; then
            log "✓ Health check passed: $endpoint"
        else
            log "✗ Health check failed: $endpoint"
            ((failed_checks++))
        fi
    done

    if [ $failed_checks -eq 0 ]; then
        log "All health checks passed"
        return 0
    else
        log "ERROR: $failed_checks health checks failed"
        return 1
    fi
}
# Generate environment documentation
generate_documentation() {
    # Write a markdown snapshot of one environment (Terraform outputs,
    # Kubernetes services, env config) to docs/<env>_environment.md.
    # Globals: CONFIG_DIR, KUBERNETES_NAMESPACE (read).
    # Arguments: $1 - environment name.
    local target_env="$1"
    local doc_file="docs/${target_env}_environment.md"

    log "Generating documentation for environment: $target_env"
    mkdir -p docs

    {
        printf '%s\n' "# $target_env Environment Documentation" ""
        printf '%s\n' "Generated on: $(date)" ""
        printf '%s\n' "## Infrastructure"
        # Terraform outputs saved by provision_infrastructure, if any.
        if [ -f "$CONFIG_DIR/${target_env}_outputs.json" ]; then
            printf '%s\n' "### Terraform Outputs" '```json'
            cat "$CONFIG_DIR/${target_env}_outputs.json"
            printf '%s\n' '```'
        fi
        printf '%s\n' "" "## Services"
        # Live Kubernetes service listing, when kubectl is available.
        if command -v kubectl >/dev/null 2>&1; then
            printf '%s\n' "### Kubernetes Services" '```'
            kubectl get services -n "$KUBERNETES_NAMESPACE" 2>/dev/null || echo "No Kubernetes services found"
            printf '%s\n' '```'
        fi
        printf '%s\n' "" "## Configuration"
        # Raw environment configuration file, embedded verbatim.
        if [ -f "$CONFIG_DIR/${target_env}.conf" ]; then
            printf '%s\n' "### Environment Configuration" '```bash'
            cat "$CONFIG_DIR/${target_env}.conf"
            printf '%s\n' '```'
        fi
    } > "$doc_file"

    log "Documentation generated: $doc_file"
}
# Cleanup environment
cleanup_environment() {
    # Destroy every deployed component of an environment after an
    # interactive confirmation. DESTRUCTIVE: tears down infrastructure.
    # Globals: KUBERNETES_NAMESPACE, TERRAFORM_DIR, CONFIG_DIR (read).
    # Arguments: $1 - environment name.
    # Returns: 0 (also when cancelled).
    local env="$1"

    log "Cleaning up environment: $env"

    # Interactive safety gate; -r keeps backslashes in the reply literal.
    local confirm
    read -r -p "Are you sure you want to destroy the $env environment? (yes/no): " confirm
    if [ "$confirm" != "yes" ]; then
        log "Cleanup cancelled"
        return 0
    fi

    # Destroy Kubernetes resources (best-effort).
    if [ -d "k8s/$env" ] && command -v kubectl >/dev/null 2>&1; then
        kubectl delete -f "k8s/$env/" -n "$KUBERNETES_NAMESPACE" 2>/dev/null || true
        log "Kubernetes resources deleted"
    fi

    # Stop Docker Compose services and remove their volumes (-v).
    if [ -f "docker-compose.${env}.yml" ]; then
        docker-compose -f "docker-compose.${env}.yml" down -v
        log "Docker Compose services stopped"
    fi

    # Destroy Terraform infrastructure. Run in a subshell so the caller's
    # working directory is untouched even if a terraform step fails
    # (the original unconditional `cd`/`cd -` pair could strand the caller).
    if [ -d "$TERRAFORM_DIR" ]; then
        (
            cd "$TERRAFORM_DIR" || exit 1
            terraform workspace select "$env"
            terraform destroy -var-file="$CONFIG_DIR/${env}.tfvars" -auto-approve
            terraform workspace select default
            terraform workspace delete "$env"
        )
        log "Infrastructure destroyed"
    fi

    log "Environment cleanup completed"
}
# Show environment status
show_status() {
    # Print a human-readable status summary for one environment, covering
    # Terraform resources, Kubernetes objects, and Docker Compose services.
    # Globals: TERRAFORM_DIR, KUBERNETES_NAMESPACE (read).
    # Arguments: $1 - environment name.
    local target_env="$1"

    echo "Environment Status: $target_env"
    echo "======================="
    echo

    # Terraform: list managed resource addresses in this env's workspace.
    if [ -d "$TERRAFORM_DIR" ]; then
        echo "Infrastructure Status:"
        cd "$TERRAFORM_DIR"
        terraform workspace select "$target_env" 2>/dev/null && terraform show -json | jq -r '.values.root_module.resources[].address' 2>/dev/null || echo "No infrastructure found"
        cd - >/dev/null
        echo
    fi

    # Kubernetes: dump everything in the configured namespace.
    if command -v kubectl >/dev/null 2>&1; then
        echo "Kubernetes Status:"
        kubectl get all -n "$KUBERNETES_NAMESPACE" 2>/dev/null || echo "No Kubernetes resources found"
        echo
    fi

    # Docker Compose: container status for this environment's stack.
    local compose_file="docker-compose.${target_env}.yml"
    if [ -f "$compose_file" ]; then
        echo "Docker Compose Status:"
        docker-compose -f "$compose_file" ps
    fi
}
# Main provisioning process
provision_environment() {
    # End-to-end provisioning pipeline for one environment:
    # prerequisites -> config -> infrastructure -> server configuration ->
    # application deploy -> monitoring -> health checks -> documentation.
    # Hard failures (infra/config/deploy) terminate the whole script.
    # Arguments: $1 - environment name.
    local target_env="$1"

    log "Starting environment provisioning: $target_env"

    check_prerequisites
    load_environment_config "$target_env"

    # Provision infrastructure — fatal on failure.
    if ! provision_infrastructure "$target_env"; then
        log "ERROR: Infrastructure provisioning failed"
        exit 1
    fi
    log "Infrastructure provisioning completed"

    # Configure servers — fatal on failure.
    if ! configure_servers "$target_env"; then
        log "ERROR: Server configuration failed"
        exit 1
    fi
    log "Server configuration completed"

    # Deploy applications — fatal on failure.
    if ! deploy_applications "$target_env"; then
        log "ERROR: Application deployment failed"
        exit 1
    fi
    log "Application deployment completed"

    # Monitoring setup is best-effort (setup_monitoring swallows errors).
    setup_monitoring "$target_env"

    # Failed health checks are reported but do not abort the run.
    if run_health_checks "$target_env"; then
        log "Health checks passed"
    else
        log "WARNING: Some health checks failed"
    fi

    generate_documentation "$target_env"

    log "Environment provisioning completed successfully: $target_env"
}
# Main execution
# Command dispatcher: the sub-command is the SECOND CLI argument and
# defaults to "provision".
# NOTE(review): $ENVIRONMENT and the ENV_CONFIGS array are defined earlier
# in the script (outside this section) — presumably ENVIRONMENT is taken
# from $1; confirm against the top of the file.
case "${2:-provision}" in
provision)
# Full pipeline: infrastructure + configuration + deploy + checks + docs.
provision_environment "$ENVIRONMENT"
;;
infrastructure)
# Terraform provisioning only.
check_prerequisites
load_environment_config "$ENVIRONMENT"
provision_infrastructure "$ENVIRONMENT"
;;
configure)
# Ansible server configuration only.
check_prerequisites
load_environment_config "$ENVIRONMENT"
configure_servers "$ENVIRONMENT"
;;
deploy)
# Application rollout (Kubernetes / Docker Compose) only.
check_prerequisites
load_environment_config "$ENVIRONMENT"
deploy_applications "$ENVIRONMENT"
;;
health)
# Probe the environment's health endpoints.
load_environment_config "$ENVIRONMENT"
run_health_checks "$ENVIRONMENT"
;;
status)
show_status "$ENVIRONMENT"
;;
cleanup)
# Interactive, destructive teardown.
cleanup_environment "$ENVIRONMENT"
;;
docs)
generate_documentation "$ENVIRONMENT"
;;
*)
# Unknown sub-command: print usage and exit non-zero.
echo "Usage: $0 <environment> {provision|infrastructure|configure|deploy|health|status|cleanup|docs}"
echo
echo "Environments: ${!ENV_CONFIGS[*]}"
echo
echo "Commands:"
echo " provision - Full environment provisioning"
echo " infrastructure - Provision infrastructure only"
echo " configure - Configure servers only"
echo " deploy - Deploy applications only"
echo " health - Run health checks"
echo " status - Show environment status"
echo " cleanup - Destroy environment"
echo " docs - Generate documentation"
exit 1
;;
esac
Web Scraping and API Automation
Web Scraper
#!/bin/bash
# web_scraper.sh - Web scraping and data extraction tool
# Target URL may be given as the first positional argument; the option
# parser further down only fills URL when it is still empty.
URL="$1"
# Directory receiving all scraped artifacts (pages, reports, log file).
OUTPUT_DIR="${OUTPUT_DIR:-./scraped_data}"
# User-Agent header sent with every HTTP request.
USER_AGENT="${USER_AGENT:-Mozilla/5.0 (Linux; Bash Web Scraper)}"
# Seconds to pause between requests (polite rate limiting).
DELAY="${DELAY:-1}"
# Crawl budget: maximum number of pages to fetch.
MAX_PAGES="${MAX_PAGES:-10}"
# Create output directory
# NOTE(review): this runs before -o/--output is parsed, so the default
# directory gets created even when a different output dir is selected
# later — confirm whether the early mkdir is intentional.
mkdir -p "$OUTPUT_DIR"
# Logging function
log() {
    # Emit a timestamped message to stdout and append it to the scraper
    # log file inside $OUTPUT_DIR.
    printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" | tee -a "$OUTPUT_DIR/scraper.log"
}
# Download page with retry logic
download_page() {
    # Fetch a URL into a file with up to 3 retries and linear backoff.
    # -f makes curl fail on HTTP 4xx/5xx so error pages are retried
    # instead of being silently saved as content (without it the retry
    # loop never fired on HTTP errors); --max-time bounds each attempt.
    # Arguments: $1 - URL, $2 - output file path.
    # Returns: 0 on success, 1 after all retries fail.
    local url="$1"
    local output_file="$2"
    local max_retries=3
    local retry_count=0

    while [ $retry_count -lt $max_retries ]; do
        if curl -f -s -L -A "$USER_AGENT" \
            --max-time 60 \
            -H "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" \
            -H "Accept-Language: en-US,en;q=0.5" \
            -H "Accept-Encoding: gzip, deflate" \
            -H "Connection: keep-alive" \
            --compressed \
            "$url" > "$output_file"; then
            log "Downloaded: $url"
            return 0
        else
            ((retry_count++))
            log "Retry $retry_count/$max_retries for: $url"
            # Back off a little longer after each failed attempt.
            sleep $((retry_count * 2))
        fi
    done

    log "Failed to download after $max_retries retries: $url"
    return 1
}
# Extract links from HTML
extract_links() {
    # Emit a sorted, de-duplicated list of URLs found in href="" attributes.
    # Absolute URLs pass through; root-relative and bare-relative links are
    # resolved against the supplied base URL (site root).
    # Arguments: $1 - HTML file, $2 - base URL. Output: URLs on stdout.
    local page="$1"
    local root="$2"

    grep -oP 'href="\K[^"]*' "$page" | while IFS= read -r href; do
        case "$href" in
            http://*|https://*) printf '%s\n' "$href" ;;
            /*)                 printf '%s\n' "${root%/}$href" ;;
            ?*)                 printf '%s\n' "${root%/}/$href" ;;
        esac
    done | sort -u
}
# Extract text content
extract_text() {
    # Strip HTML tags from $1, decode common HTML entities, drop blank
    # lines, and write the plain text to $2.
    # The entity-decoding sed here was previously garbled into no-op
    # substitutions (e.g. `s/&/\&/g`); the intended entity names
    # (&nbsp; &lt; &gt; &quot; &amp;) are restored. &amp; is decoded LAST
    # so already-decoded '<'/'>' text cannot be double-processed.
    local html_file="$1"
    local output_file="$2"

    sed 's/<[^>]*>//g' "$html_file" | \
    sed 's/&nbsp;/ /g; s/&lt;/</g; s/&gt;/>/g; s/&quot;/"/g; s/&amp;/\&/g' | \
    sed '/^[[:space:]]*$/d' > "$output_file"

    log "Extracted text to: $output_file"
}
# Extract specific data using patterns
extract_data() {
    # Pull every match of a caller-supplied PCRE out of an HTML file.
    # Arguments: $1 - HTML file, $2 - PCRE pattern (grep -oP),
    #            $3 - output file (one match per line).
    local source_html="$1"
    local regex="$2"
    local dest="$3"

    grep -oP "$regex" "$source_html" > "$dest"

    local match_count
    match_count=$(wc -l < "$dest")
    log "Extracted $match_count items matching pattern: $regex"
}
# Extract emails
extract_emails() {
    # Collect unique email addresses from an HTML file into $2, sorted.
    local source_file="$1"
    local dest_file="$2"

    grep -oE '[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}' "$source_file" | sort -u > "$dest_file"

    local total
    total=$(wc -l < "$dest_file")
    log "Extracted $total unique email addresses"
}
# Extract phone numbers
extract_phones() {
    # Collect unique North-American-style phone numbers from an HTML file
    # into $2, sorted. The pattern covers optional +1 prefix, optional
    # parentheses around the area code, and -, ., or space separators.
    local source_file="$1"
    local dest_file="$2"

    grep -oE '(\+?1[-.\s]?)?\(?[0-9]{3}\)?[-.\s]?[0-9]{3}[-.\s]?[0-9]{4}' "$source_file" | sort -u > "$dest_file"

    local total
    total=$(wc -l < "$dest_file")
    log "Extracted $total phone numbers"
}
# Extract images
extract_images() {
    # Collect absolute URLs of common raster images referenced via src=""
    # attributes, sorted and de-duplicated, into $3. Relative sources are
    # resolved against the base URL (site root).
    # Arguments: $1 - HTML file, $2 - base URL, $3 - output file.
    local page="$1"
    local root="$2"
    local dest="$3"

    grep -oP 'src="\K[^"]*\.(jpg|jpeg|png|gif|webp)' "$page" | while IFS= read -r src; do
        case "$src" in
            http://*|https://*) printf '%s\n' "$src" ;;
            /*)                 printf '%s\n' "${root%/}$src" ;;
            ?*)                 printf '%s\n' "${root%/}/$src" ;;
        esac
    done | sort -u > "$dest"

    local found
    found=$(wc -l < "$dest")
    log "Extracted $found image URLs"
}
# Download images
download_images() {
    # Fetch every image URL listed (one per line) in $1 into
    # $OUTPUT_DIR/images/, skipping files already present on disk.
    # Sleeps $DELAY between downloads for rate limiting.
    local url_list="$1"
    local dest_dir="$OUTPUT_DIR/images"

    mkdir -p "$dest_dir"

    local img_url file_name target
    while IFS= read -r img_url; do
        # Drop any query string when deriving the local file name.
        file_name=$(basename "$img_url" | cut -d'?' -f1)
        target="$dest_dir/$file_name"
        [ -f "$target" ] && continue
        if curl -s -L -A "$USER_AGENT" "$img_url" -o "$target"; then
            log "Downloaded image: $file_name"
        else
            log "Failed to download image: $img_url"
        fi
        sleep "$DELAY"
    done < "$url_list"
}
# Generate sitemap
generate_sitemap() {
    # Build a minimal sitemap.xml from a file of URLs (one per line).
    # URLs are XML-escaped before being embedded: a raw '&' in a query
    # string (e.g. ?a=1&b=2) previously produced invalid XML.
    # Arguments: $1 - file with URLs. Output: $OUTPUT_DIR/sitemap.xml.
    local links_file="$1"
    local sitemap_file="$OUTPUT_DIR/sitemap.xml"
    local url escaped

    {
        echo '<?xml version="1.0" encoding="UTF-8"?>'
        echo '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
        while IFS= read -r url; do
            # Escape & first so the other entities are not double-escaped.
            escaped=${url//&/&amp;}
            escaped=${escaped//</&lt;}
            escaped=${escaped//>/&gt;}
            echo "  <url>"
            echo "    <loc>$escaped</loc>"
            echo "    <lastmod>$(date -I)</lastmod>"
            echo "  </url>"
        done < "$links_file"
        echo '</urlset>'
    } > "$sitemap_file"

    log "Generated sitemap: $sitemap_file"
}
# Analyze page structure
analyze_structure() {
# Produce a one-shot structural report of an HTML page: tag frequency,
# meta tags, title, headings, and form/script/stylesheet counts.
# Arguments: $1 - path to a downloaded HTML file.
# Output: report written to $OUTPUT_DIR/structure_analysis.txt.
local html_file="$1"
local analysis_file="$OUTPUT_DIR/structure_analysis.txt"
{
echo "Page Structure Analysis"
echo "======================"
echo "Generated: $(date)"
echo
echo "HTML Tags Count:"
echo "==============="
# Pull every tag, keep only the tag name, and count occurrences
# (most frequent first).
grep -o '<[^>]*>' "$html_file" | sed 's/<\([^ >]*\).*/\1/' | sort | uniq -c | sort -nr
echo
echo "Meta Tags:"
echo "=========="
grep -i '<meta' "$html_file" || echo "No meta tags found"
echo
echo "Title:"
echo "======"
# \K drops the literal "<title>" prefix from the match (PCRE).
grep -oP '<title>\K[^<]*' "$html_file" || echo "No title found"
echo
echo "Headings:"
echo "========="
grep -oP '<h[1-6][^>]*>\K[^<]*' "$html_file" || echo "No headings found"
echo
echo "Forms:"
echo "======"
# xargs prepends the label to the count produced by grep -c.
grep -c '<form' "$html_file" | xargs echo "Form count:"
echo
echo "Scripts:"
echo "========"
grep -c '<script' "$html_file" | xargs echo "Script count:"
echo
echo "Stylesheets:"
echo "============"
grep -c '<link.*stylesheet' "$html_file" | xargs echo "Stylesheet count:"
} > "$analysis_file"
log "Structure analysis saved: $analysis_file"
}
# Main scraping function
scrape_website() {
    # Breadth-first crawl starting at $1, limited to $MAX_PAGES pages on
    # the same site. Each fetched page has its text, emails, phone
    # numbers, images, and links extracted into $OUTPUT_DIR.
    local start_url="$1"
    local base_url
    # Reduce the start URL to scheme://host for same-site filtering.
    base_url=$(echo "$start_url" | sed 's|^\(https\?://[^/]*\).*|\1|')

    # Ensure the output directory exists even when -o/--output changed
    # OUTPUT_DIR after the default directory was created at startup.
    mkdir -p "$OUTPUT_DIR"

    log "Starting web scraping: $start_url"
    log "Base URL: $base_url"

    local page_count=0
    local visited_file="$OUTPUT_DIR/visited_urls.txt"
    local queue_file="$OUTPUT_DIR/url_queue.txt"

    # Seed the crawl queue with the start URL.
    echo "$start_url" > "$queue_file"
    touch "$visited_file"

    while [ $page_count -lt $MAX_PAGES ] && [ -s "$queue_file" ]; do
        # Pop the next URL off the queue (GNU sed in-place delete).
        local current_url
        current_url=$(head -1 "$queue_file")
        sed -i '1d' "$queue_file"

        # Skip URLs we have already fetched (-F: literal, -x: whole line).
        if grep -Fxq "$current_url" "$visited_file"; then
            continue
        fi
        echo "$current_url" >> "$visited_file"
        ((page_count++))
        log "Processing page $page_count/$MAX_PAGES: $current_url"

        local html_file="$OUTPUT_DIR/page_${page_count}.html"
        if download_page "$current_url" "$html_file"; then
            # Per-page extraction passes.
            extract_text "$html_file" "$OUTPUT_DIR/page_${page_count}.txt"
            extract_emails "$html_file" "$OUTPUT_DIR/page_${page_count}_emails.txt"
            extract_phones "$html_file" "$OUTPUT_DIR/page_${page_count}_phones.txt"
            extract_images "$html_file" "$base_url" "$OUTPUT_DIR/page_${page_count}_images.txt"

            local links_file="$OUTPUT_DIR/page_${page_count}_links.txt"
            extract_links "$html_file" "$base_url" > "$links_file"

            # Enqueue same-site links only. A literal prefix comparison is
            # used instead of the previous unquoted `=~ ^$base_url`, where
            # dots and other regex metacharacters in the host could match
            # unrelated domains (e.g. example.com matching exampleXcom).
            local link
            while IFS= read -r link; do
                if [[ $link == "$base_url"* ]] && ! grep -Fxq "$link" "$visited_file"; then
                    echo "$link" >> "$queue_file"
                fi
            done < "$links_file"

            # Structural analysis is only produced once, for the entry page.
            if [ $page_count -eq 1 ]; then
                analyze_structure "$html_file"
            fi
        fi

        # Respect rate limiting between requests.
        sleep "$DELAY"
    done

    log "Scraping completed. Processed $page_count pages."
    generate_reports
}
# Generate consolidated reports
generate_reports() {
    # Merge the per-page extraction files into consolidated, de-duplicated
    # "all_*" lists, build a sitemap, and write a summary report.
    # Globals: OUTPUT_DIR, URL (read).
    log "Generating consolidated reports..."

    # Merge page_*_<kind>.txt into all_<kind>.txt and echo the line count.
    _consolidate() {
        cat "$OUTPUT_DIR"/page_*_"$1".txt 2>/dev/null | sort -u > "$OUTPUT_DIR/all_$1.txt"
        wc -l < "$OUTPUT_DIR/all_$1.txt" 2>/dev/null || echo 0
    }

    local email_total phone_total image_total link_total
    email_total=$(_consolidate emails)
    log "Total unique emails found: $email_total"
    phone_total=$(_consolidate phones)
    log "Total unique phone numbers found: $phone_total"
    image_total=$(_consolidate images)
    log "Total unique images found: $image_total"
    link_total=$(_consolidate links)
    log "Total unique links found: $link_total"

    # Sitemap from the consolidated link list.
    if [ -f "$OUTPUT_DIR/all_links.txt" ]; then
        generate_sitemap "$OUTPUT_DIR/all_links.txt"
    fi

    # Final human-readable summary.
    {
        echo "Web Scraping Summary Report"
        echo "=========================="
        echo "Generated: $(date)"
        echo "Target URL: $URL"
        echo "Pages processed: $(ls "$OUTPUT_DIR"/page_*.html 2>/dev/null | wc -l)"
        echo "Unique emails: $email_total"
        echo "Unique phones: $phone_total"
        echo "Unique images: $image_total"
        echo "Unique links: $link_total"
        echo
        echo "Output directory: $OUTPUT_DIR"
        echo
        echo "Files generated:"
        ls -la "$OUTPUT_DIR"
    } > "$OUTPUT_DIR/summary_report.txt"

    log "Summary report generated: $OUTPUT_DIR/summary_report.txt"
}
# Show usage
show_usage() {
# Print the CLI help text to stdout. The unquoted EOF delimiter is
# intentional so that $0 expands to the script name inside the here-doc.
cat << EOF
Web Scraper - Extract data from websites
Usage: $0 <url> [options]
Options:
-d, --delay SECONDS Delay between requests (default: 1)
-m, --max-pages NUM Maximum pages to scrape (default: 10)
-o, --output DIR Output directory (default: ./scraped_data)
-u, --user-agent STR User agent string
-i, --images Download images
-h, --help Show this help
Environment Variables:
OUTPUT_DIR Output directory
USER_AGENT User agent string
DELAY Delay between requests
MAX_PAGES Maximum pages to scrape
Examples:
$0 https://example.com
$0 https://example.com -d 2 -m 5
$0 https://example.com --images
EOF
}
# Parse command line arguments
# Whether to fetch the discovered image URLs after the crawl (-i/--images).
DOWNLOAD_IMAGES=false
# Walk the argument list; option flags override the environment-variable
# defaults set at the top of the script.
while [[ $# -gt 0 ]]; do
case $1 in
-d|--delay)
DELAY="$2"
shift 2
;;
-m|--max-pages)
MAX_PAGES="$2"
shift 2
;;
-o|--output)
OUTPUT_DIR="$2"
shift 2
;;
-u|--user-agent)
USER_AGENT="$2"
shift 2
;;
-i|--images)
DOWNLOAD_IMAGES=true
shift
;;
-h|--help)
show_usage
exit 0
;;
-*)
# Unknown flag: show help and fail.
echo "Unknown option: $1"
show_usage
exit 1
;;
*)
# First bare (non-option) argument is the target URL; any further
# positional arguments are silently ignored.
if [ -z "$URL" ]; then
URL="$1"
fi
shift
;;
esac
done
# Main execution
# Abort early when no URL was supplied (positionally or otherwise).
if [ -z "$URL" ]; then
echo "Error: URL is required"
show_usage
exit 1
fi
# Validate URL: must carry an explicit http/https scheme.
if ! [[ $URL =~ ^https?:// ]]; then
echo "Error: Invalid URL format. Must start with http:// or https://"
exit 1
fi
# Check dependencies: curl is the only hard external requirement.
if ! command -v curl >/dev/null 2>&1; then
echo "Error: curl is required but not installed"
exit 1
fi
# Start scraping (crawl + per-page extraction + consolidated reports).
scrape_website "$URL"
# Download images if requested via -i/--images and any were found.
if [ "$DOWNLOAD_IMAGES" = true ] && [ -f "$OUTPUT_DIR/all_images.txt" ]; then
log "Downloading images..."
download_images "$OUTPUT_DIR/all_images.txt"
fi
log "Web scraping completed successfully"
echo "Results saved in: $OUTPUT_DIR"
These automation examples demonstrate comprehensive solutions for common DevOps and system administration tasks. Each script includes error handling, logging, configuration management, and notification capabilities, making them suitable for production environments.