Health Checking
Comprehensive guide to implementing robust health checks for your services in ScoutQuest.
Overview
Health checks are critical for maintaining service reliability in distributed systems. ScoutQuest provides comprehensive health checking capabilities to ensure only healthy service instances receive traffic.
Health Check Types
- Liveness Checks - Determine if the service is running
- Readiness Checks - Determine if the service is ready to receive traffic
- Startup Checks - Determine if the service has completed initialization
- Custom Checks - Application-specific health validations
Basic Health Check Implementation
Simple HTTP Health Endpoint
const express = require('express');
const { ScoutQuestClient } = require('scoutquest-js');
const app = express();
const client = new ScoutQuestClient({ serverUrl: 'http://localhost:8080' });
// Basic health check
app.get('/health', (req, res) => {
res.status(200).json({
status: 'healthy',
timestamp: new Date().toISOString(),
uptime: process.uptime(),
version: process.env.npm_package_version
});
});
// Readiness check
app.get('/ready', async (req, res) => {
  // Check that critical dependencies are reachable before accepting traffic
  const isReady = (await checkDatabaseConnection()) && (await checkExternalAPIs());
if (isReady) {
res.status(200).json({
status: 'ready',
timestamp: new Date().toISOString()
});
} else {
res.status(503).json({
status: 'not ready',
timestamp: new Date().toISOString()
});
}
});
// Register service with health check URL
async function startServer() {
const port = 3000;
app.listen(port, async () => {
console.log(`Server running on port ${port}`);
// Register with ScoutQuest
await client.registerService('user-service', 'localhost', port, {
version: '1.0.0',
environment: 'production'
}, {
healthCheckUrl: `http://localhost:${port}/health`
});
});
}
startServer();
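The readiness handler above awaits two helper functions that are not shown. Minimal sketches, assuming a knex-style `db` client (as in the later examples) and a single upstream dependency; adapt the clients and URL to your stack:

// Hypothetical helpers backing the /ready endpoint above
async function checkDatabaseConnection() {
  try {
    await db.raw('SELECT 1'); // assumes a knex-style client
    return true;
  } catch (error) {
    return false;
  }
}

async function checkExternalAPIs() {
  try {
    const response = await fetch('https://api.external-service.com/health', {
      signal: AbortSignal.timeout(3000)
    });
    return response.ok;
  } catch (error) {
    return false;
  }
}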
An equivalent implementation in Rust with axum:

use axum::{
extract::Extension,
http::StatusCode,
response::Json,
routing::get,
Router,
};
use scoutquest_rust::ServiceDiscoveryClient;
use serde_json::{json, Value};
use std::sync::Arc;
use tokio::net::TcpListener;
// Record the process start time at module level so /health can report real uptime
static START: std::sync::OnceLock<std::time::Instant> = std::sync::OnceLock::new();

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    START.get_or_init(std::time::Instant::now);
    let client = Arc::new(ServiceDiscoveryClient::new("http://localhost:8080")?);
let app = Router::new()
.route("/health", get(health_check))
.route("/ready", get(readiness_check))
.layer(Extension(client.clone()));
let listener = TcpListener::bind("0.0.0.0:3000").await?;
println!("Server running on port 3000");
// Register service with health check
client.register_service_with_health(
"user-service",
"localhost",
3000,
Some([("version".to_string(), "1.0.0".to_string())].into()),
"http://localhost:3000/health"
).await?;
axum::serve(listener, app).await?;
Ok(())
}
async fn health_check() -> Json<Value> {
    // Uptime measured from the start instant recorded when the process launched
    let uptime_secs = START
        .get_or_init(std::time::Instant::now)
        .elapsed()
        .as_secs();
    Json(json!({
        "status": "healthy",
        "timestamp": chrono::Utc::now().to_rfc3339(),
        "uptime": uptime_secs,
"version": env!("CARGO_PKG_VERSION")
}))
}
async fn readiness_check() -> Result<Json<Value>, StatusCode> {
// Check dependencies
let database_ready = check_database().await;
let cache_ready = check_cache().await;
if database_ready && cache_ready {
Ok(Json(json!({
"status": "ready",
"timestamp": chrono::Utc::now().to_rfc3339()
})))
} else {
Err(StatusCode::SERVICE_UNAVAILABLE)
}
}
async fn check_database() -> bool {
// Implement database connectivity check
true
}
async fn check_cache() -> bool {
// Implement cache connectivity check
true
}
Advanced Health Checks
Multi-Component Health Checks
// Advanced health check with multiple components
app.get('/health', async (req, res) => {
const healthChecks = {};
let overallHealthy = true;
// Database health check
try {
await db.raw('SELECT 1');
healthChecks.database = {
status: 'healthy',
responseTime: await measureDatabaseResponseTime()
};
} catch (error) {
healthChecks.database = {
status: 'unhealthy',
error: error.message
};
overallHealthy = false;
}
// Redis health check
try {
await redis.ping();
healthChecks.cache = {
status: 'healthy',
responseTime: await measureRedisResponseTime()
};
} catch (error) {
healthChecks.cache = {
status: 'unhealthy',
error: error.message
};
overallHealthy = false;
}
// External API health check
try {
const response = await fetch('https://api.external-service.com/health', {
  signal: AbortSignal.timeout(5000) // fetch has no `timeout` option; abort via signal
});
healthChecks.externalAPI = {
status: response.ok ? 'healthy' : 'degraded',
responseTime: response.headers.get('X-Response-Time')
};
} catch (error) {
healthChecks.externalAPI = {
status: 'unhealthy',
error: error.message
};
// External API failure doesn't mark service as unhealthy
// overallHealthy = false;
}
// Application-specific checks
healthChecks.application = {
status: 'healthy',
memoryUsage: process.memoryUsage(),
cpuUsage: process.cpuUsage(),
activeConnections: getActiveConnectionCount(),
queueSize: getJobQueueSize()
};
const statusCode = overallHealthy ? 200 : 503;
res.status(statusCode).json({
status: overallHealthy ? 'healthy' : 'unhealthy',
timestamp: new Date().toISOString(),
checks: healthChecks,
uptime: process.uptime(),
version: process.env.npm_package_version
});
});
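The multi-component check above leans on a few helper functions. Hedged sketches of what they might look like, reusing the same `db` and `redis` clients; the connection counter and in-memory job queue are assumptions, so substitute whatever your application actually tracks:

// Hypothetical helpers for the multi-component health check above
async function measureDatabaseResponseTime() {
  const start = Date.now();
  await db.raw('SELECT 1');
  return Date.now() - start; // milliseconds
}

async function measureRedisResponseTime() {
  const start = Date.now();
  await redis.ping();
  return Date.now() - start; // milliseconds
}

// Assumes the app maintains its own counter, e.g. incremented on the HTTP server's
// 'connection' event and decremented when each socket closes
let activeConnections = 0;
function getActiveConnectionCount() {
  return activeConnections;
}

// Assumes an in-memory job queue array; swap for your queue client's size API
const jobQueue = [];
function getJobQueueSize() {
  return jobQueue.length;
}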
Health Check Timeouts and Retries
class HealthChecker {
constructor(options = {}) {
this.timeout = options.timeout || 5000;
this.retries = options.retries || 3;
this.retryDelay = options.retryDelay || 1000;
}
async checkWithRetry(checkFunction, componentName) {
for (let attempt = 1; attempt <= this.retries; attempt++) {
try {
const startTime = Date.now();
const result = await Promise.race([
checkFunction(),
new Promise((_, reject) =>
setTimeout(() => reject(new Error('Timeout')), this.timeout)
)
]);
return {
status: 'healthy',
responseTime: Date.now() - startTime,
attempt
};
} catch (error) {
if (attempt === this.retries) {
return {
status: 'unhealthy',
error: error.message,
attempts: attempt
};
}
// Wait before retry
await new Promise(resolve => setTimeout(resolve, this.retryDelay));
}
}
}
async checkDatabase() {
return this.checkWithRetry(
() => db.raw('SELECT 1'),
'database'
);
}
async checkRedis() {
return this.checkWithRetry(
() => redis.ping(),
'cache'
);
}
}
const healthChecker = new HealthChecker({
timeout: 5000,
retries: 3,
retryDelay: 1000
});
app.get('/health', async (req, res) => {
const checks = {};
let overallHealthy = true;
const [databaseCheck, cacheCheck] = await Promise.all([
healthChecker.checkDatabase(),
healthChecker.checkRedis()
]);
checks.database = databaseCheck;
checks.cache = cacheCheck;
if (databaseCheck.status !== 'healthy' || cacheCheck.status !== 'healthy') {
overallHealthy = false;
}
res.status(overallHealthy ? 200 : 503).json({
status: overallHealthy ? 'healthy' : 'unhealthy',
timestamp: new Date().toISOString(),
checks
});
});
ScoutQuest Health Check Configuration
Server-Side Health Check Settings
# config/production.toml
[health_check]
enabled = true
interval = "30s" # How often to check service health
timeout = "5s" # Health check request timeout
retry_count = 3 # Number of retries before marking unhealthy
success_threshold = 2 # Consecutive successes to mark healthy
failure_threshold = 3 # Consecutive failures to mark unhealthy
# Health check HTTP settings
user_agent = "ScoutQuest-HealthChecker/1.0"
follow_redirects = false
expected_status_codes = [200, 204]
# Advanced settings
parallel_checks = true # Run health checks in parallel
max_concurrent_checks = 50 # Maximum concurrent health checks
rate_limit_per_second = 10 # Rate limit health checks per service
# Alerting
alert_on_failure = true
alert_webhook = "https://hooks.slack.com/services/..."
alert_email = "alerts@company.com"
Client-Side Health Check Registration
const client = new ScoutQuestClient({
serverUrl: 'http://localhost:8080'
});
// Register with comprehensive health check configuration
await client.registerService('api-service', 'localhost', 3000, {
version: '1.2.3',
environment: 'production',
team: 'backend'
}, {
// Health check configuration
healthCheck: {
url: 'http://localhost:3000/health',
interval: 30000, // 30 seconds
timeout: 5000, // 5 seconds
retries: 3,
successThreshold: 2,
failureThreshold: 3,
initialDelay: 10000, // Wait 10s before first check
expectedStatusCodes: [200],
expectedResponseBody: { status: 'healthy' }, // Optional
headers: {
'Authorization': 'Bearer health-check-token'
}
},
// Additional endpoints
readinessCheck: {
url: 'http://localhost:3000/ready',
timeout: 3000
},
startupCheck: {
url: 'http://localhost:3000/startup',
timeout: 30000, // Longer timeout for startup
maxAttempts: 10
}
});
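The startupCheck above points at a /startup endpoint that is not implemented elsewhere in this guide. A minimal sketch, assuming the service flips an in-memory flag once initialization (migrations, cache warm-up, connection pools) has finished:

// Hypothetical startup endpoint for the startupCheck registered above
let initializationComplete = false;

async function initialize() {
  // run migrations, warm caches, open connection pools, ...
  initializationComplete = true;
}

app.get('/startup', (req, res) => {
  if (initializationComplete) {
    res.status(200).json({ status: 'started', timestamp: new Date().toISOString() });
  } else {
    res.status(503).json({ status: 'starting', timestamp: new Date().toISOString() });
  }
});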
The equivalent registration in Rust:

use scoutquest_rust::{ServiceDiscoveryClient, HealthCheckConfig};
use std::time::Duration;
use std::collections::HashMap;
let client = ServiceDiscoveryClient::new("http://localhost:8080")?;
let mut headers = HashMap::new();
headers.insert("Authorization".to_string(), "Bearer health-check-token".to_string());
let health_config = HealthCheckConfig::builder()
.url("http://localhost:3000/health")
.interval(Duration::from_secs(30))
.timeout(Duration::from_secs(5))
.retries(3)
.success_threshold(2)
.failure_threshold(3)
.initial_delay(Duration::from_secs(10))
.expected_status_codes(vec![200])
.headers(headers)
.build();
let instance = client.register_service_advanced(
"api-service",
"localhost",
3000,
Some([
("version".to_string(), "1.2.3".to_string()),
("environment".to_string(), "production".to_string()),
].into()),
health_config
).await?;
Health Check Patterns
Graceful Degradation
app.get('/health', async (req, res) => {
const checks = {};
let status = 'healthy';
let statusCode = 200;
// Critical dependencies (must be healthy)
const criticalChecks = await Promise.all([
checkDatabase(),
checkAuthService()
]);
// Non-critical dependencies (can be degraded)
const nonCriticalChecks = await Promise.all([
checkRecommendationService(),
checkAnalyticsService(),
checkNotificationService()
]);
// Evaluate critical dependencies
const criticalFailures = criticalChecks.filter(check => check.status !== 'healthy');
if (criticalFailures.length > 0) {
status = 'unhealthy';
statusCode = 503;
}
// Evaluate non-critical dependencies
const nonCriticalFailures = nonCriticalChecks.filter(check => check.status !== 'healthy');
if (nonCriticalFailures.length > 0 && status === 'healthy') {
status = 'degraded';
statusCode = 200; // Still accept traffic but with reduced functionality
}
res.status(statusCode).json({
status,
timestamp: new Date().toISOString(),
checks: {
critical: Object.fromEntries(
['database', 'auth'].map((name, i) => [name, criticalChecks[i]])
),
nonCritical: Object.fromEntries(
['recommendations', 'analytics', 'notifications'].map((name, i) => [name, nonCriticalChecks[i]])
)
}
});
});
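Each check function used in this pattern only needs to resolve to an object with a `status` field. One hypothetical example for the auth dependency (the URL is an assumption; point it at your real service):

// Hypothetical dependency check used by the graceful-degradation handler above
async function checkAuthService() {
  try {
    const response = await fetch('http://auth-service.internal/health', {
      signal: AbortSignal.timeout(3000)
    });
    return { status: response.ok ? 'healthy' : 'unhealthy' };
  } catch (error) {
    return { status: 'unhealthy', error: error.message };
  }
}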
Circuit Breaker Integration
class CircuitBreakerHealthCheck {
constructor(options = {}) {
this.failureThreshold = options.failureThreshold || 5;
this.recoveryTimeout = options.recoveryTimeout || 60000;
this.state = 'CLOSED'; // CLOSED, OPEN, HALF_OPEN
this.failures = 0;
this.lastFailureTime = null;
}
async checkHealth(healthCheckFunction) {
if (this.state === 'OPEN') {
// Check if enough time has passed to try again
if (Date.now() - this.lastFailureTime > this.recoveryTimeout) {
this.state = 'HALF_OPEN';
} else {
return {
status: 'circuit_open',
message: 'Circuit breaker is open',
state: this.state
};
}
}
try {
const result = await healthCheckFunction();
if (this.state === 'HALF_OPEN') {
  // Success in half-open state closes the circuit
  this.state = 'CLOSED';
}
// Any success resets the consecutive-failure count
this.failures = 0;
return {
status: 'healthy',
...result,
circuitState: this.state
};
} catch (error) {
this.failures++;
this.lastFailureTime = Date.now();
if (this.failures >= this.failureThreshold) {
this.state = 'OPEN';
}
return {
status: 'unhealthy',
error: error.message,
circuitState: this.state,
failures: this.failures
};
}
}
}
const dbCircuitBreaker = new CircuitBreakerHealthCheck({
failureThreshold: 5,
recoveryTimeout: 60000
});
app.get('/health', async (req, res) => {
const databaseHealth = await dbCircuitBreaker.checkHealth(
() => db.raw('SELECT 1')
);
const overallHealthy = databaseHealth.status === 'healthy';
res.status(overallHealthy ? 200 : 503).json({
status: overallHealthy ? 'healthy' : 'unhealthy',
timestamp: new Date().toISOString(),
checks: {
database: databaseHealth
}
});
});
Monitoring Health Checks
Health Check Metrics
const prometheus = require('prom-client');
// Health check metrics
const healthCheckDuration = new prometheus.Histogram({
name: 'health_check_duration_seconds',
help: 'Duration of health check requests',
labelNames: ['component', 'status']
});
const healthCheckTotal = new prometheus.Counter({
name: 'health_check_total',
help: 'Total number of health checks',
labelNames: ['component', 'status']
});
const healthCheckStatus = new prometheus.Gauge({
name: 'health_check_status',
help: 'Health check status (1 = healthy, 0 = unhealthy)',
labelNames: ['component']
});
async function instrumentedHealthCheck(component, checkFunction) {
const startTime = Date.now();
try {
const result = await checkFunction();
const duration = (Date.now() - startTime) / 1000;
healthCheckDuration.labels(component, 'success').observe(duration);
healthCheckTotal.labels(component, 'success').inc();
healthCheckStatus.labels(component).set(1);
return { status: 'healthy', responseTime: duration * 1000 };
} catch (error) {
const duration = (Date.now() - startTime) / 1000;
healthCheckDuration.labels(component, 'failure').observe(duration);
healthCheckTotal.labels(component, 'failure').inc();
healthCheckStatus.labels(component).set(0);
return { status: 'unhealthy', error: error.message, responseTime: duration * 1000 };
}
}
app.get('/health', async (req, res) => {
const [databaseCheck, cacheCheck] = await Promise.all([
instrumentedHealthCheck('database', () => db.raw('SELECT 1')),
instrumentedHealthCheck('cache', () => redis.ping())
]);
const overallHealthy = databaseCheck.status === 'healthy' && cacheCheck.status === 'healthy';
res.status(overallHealthy ? 200 : 503).json({
status: overallHealthy ? 'healthy' : 'unhealthy',
timestamp: new Date().toISOString(),
checks: {
database: databaseCheck,
cache: cacheCheck
}
});
});
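These metrics are only useful if Prometheus can scrape them. With prom-client's default registry, exposing them is a two-liner:

// Expose the collected metrics for Prometheus to scrape
app.get('/metrics', async (req, res) => {
  res.set('Content-Type', prometheus.register.contentType);
  res.end(await prometheus.register.metrics());
});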
Health Check Alerting
class HealthCheckAlerting {
constructor(options = {}) {
this.alertThreshold = options.alertThreshold || 3;
this.recoveryThreshold = options.recoveryThreshold || 2;
this.alertWebhook = options.alertWebhook;
this.componentStates = new Map();
}
async processHealthCheck(component, healthResult) {
const currentState = this.componentStates.get(component) || {
consecutiveFailures: 0,
consecutiveSuccesses: 0,
alertSent: false
};
if (healthResult.status === 'healthy') {
currentState.consecutiveSuccesses++;
currentState.consecutiveFailures = 0;
// Send recovery alert
if (currentState.alertSent && currentState.consecutiveSuccesses >= this.recoveryThreshold) {
await this.sendRecoveryAlert(component, healthResult);
currentState.alertSent = false;
}
} else {
currentState.consecutiveFailures++;
currentState.consecutiveSuccesses = 0;
// Send failure alert
if (!currentState.alertSent && currentState.consecutiveFailures >= this.alertThreshold) {
await this.sendFailureAlert(component, healthResult);
currentState.alertSent = true;
}
}
this.componentStates.set(component, currentState);
}
async sendFailureAlert(component, healthResult) {
const alert = {
type: 'health_check_failure',
component,
status: healthResult.status,
error: healthResult.error,
timestamp: new Date().toISOString(),
severity: 'critical'
};
if (this.alertWebhook) {
try {
await fetch(this.alertWebhook, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(alert)
});
} catch (error) {
console.error('Failed to send alert:', error);
}
}
console.error(`ALERT: ${component} health check failed`, alert);
}
async sendRecoveryAlert(component, healthResult) {
const alert = {
type: 'health_check_recovery',
component,
status: healthResult.status,
timestamp: new Date().toISOString(),
severity: 'info'
};
if (this.alertWebhook) {
try {
await fetch(this.alertWebhook, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(alert)
});
} catch (error) {
console.error('Failed to send recovery alert:', error);
}
}
console.info(`RECOVERY: ${component} health check recovered`, alert);
}
}
const alerting = new HealthCheckAlerting({
alertThreshold: 3,
recoveryThreshold: 2,
alertWebhook: process.env.ALERT_WEBHOOK_URL
});
// Use in health check
app.get('/health', async (req, res) => {
const databaseCheck = await instrumentedHealthCheck('database', () => db.raw('SELECT 1'));
const cacheCheck = await instrumentedHealthCheck('cache', () => redis.ping());
// Process for alerting
await alerting.processHealthCheck('database', databaseCheck);
await alerting.processHealthCheck('cache', cacheCheck);
// Rest of health check logic...
});
Best Practices
✅ Health Check Best Practices
- Keep it fast: Health checks should complete quickly (< 5 seconds)
- Check dependencies: Verify critical external dependencies
- Use appropriate status codes: 200 for healthy, 503 for unhealthy
- Include useful information: Response times, error details, component status
- Implement graceful degradation: Distinguish between critical and non-critical failures
- Monitor health check performance: Track metrics and set up alerts
- Test failure scenarios: Regularly test how your service behaves when dependencies fail
⚠️ Common Pitfalls
- Cascading failures: Don't make health checks depend on other services' health checks
- Heavy operations: Avoid expensive operations in health checks
- False positives: Ensure health checks accurately reflect service health
- Resource exhaustion: Health checks shouldn't consume significant resources
- Single points of failure: Don't rely on a single check for critical dependencies
💡 Advanced Tips
- Implement startup checks: Separate checks for service initialization
- Use circuit breakers: Protect health checks from cascading failures
- Add health check versioning: Allow evolution of health check contracts (see the sketch after this list)
- Include business logic checks: Verify core application functionality
- Implement health check discovery: Let services advertise their health endpoints
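On the versioning tip above: one lightweight approach is to carry a schema version in the payload and keep previously published fields until every consumer has migrated. A hypothetical sketch:

// Hypothetical versioned health payload: bump schemaVersion when the contract changes,
// keep old fields until all consumers have moved off them
app.get('/health', async (req, res) => {
  res.status(200).json({
    schemaVersion: 2,
    status: 'healthy',                      // present since v1
    timestamp: new Date().toISOString(),    // present since v1
    checks: {}                              // added in v2
  });
});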