Health Checking
Comprehensive guide to implementing robust health checks for your services in ScoutQuest.
Overview
Health checks are critical for maintaining service reliability in distributed systems. ScoutQuest provides comprehensive health checking capabilities to ensure only healthy service instances receive traffic.
Health Check Types
- Liveness Checks - Determine if the service is running
- Readiness Checks - Determine if the service is ready to receive traffic
- Startup Checks - Determine if the service has completed initialization
- Custom Checks - Application-specific health validations
Basic Health Check Implementation
Simple HTTP Health Endpoint
const express = require('express');
const { ScoutQuestClient } = require('scoutquest-js');
const app = express();
const client = new ScoutQuestClient({ serverUrl: 'http://localhost:8080' });
// Basic health check
app.get('/health', (req, res) => {
res.status(200).json({
status: 'healthy',
timestamp: new Date().toISOString(),
uptime: process.uptime(),
version: process.env.npm_package_version
});
});
// Readiness check
app.get('/ready', async (req, res) => {
  // Check that critical dependencies are reachable before accepting traffic
  const isReady = (await checkDatabaseConnection()) && (await checkExternalAPIs());
if (isReady) {
res.status(200).json({
status: 'ready',
timestamp: new Date().toISOString()
});
} else {
res.status(503).json({
status: 'not ready',
timestamp: new Date().toISOString()
});
}
});
// Register service with health check URL
async function startServer() {
const port = 3000;
app.listen(port, async () => {
console.log(`Server running on port ${port}`);
// Register with ScoutQuest
await client.registerService('user-service', 'localhost', port, {
version: '1.0.0',
environment: 'production'
}, {
healthCheckUrl: `http://localhost:${port}/health`
});
});
}
startServer();
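The readiness handler above awaits two helper functions that are not shown. Minimal sketches, assuming a knex-style `db` client (as in the later examples) and a single upstream dependency; adapt the clients and URL to your stack:

// Hypothetical helpers backing the /ready endpoint above
async function checkDatabaseConnection() {
  try {
    await db.raw('SELECT 1'); // assumes a knex-style client
    return true;
  } catch (error) {
    return false;
  }
}

async function checkExternalAPIs() {
  try {
    const response = await fetch('https://api.external-service.com/health', {
      signal: AbortSignal.timeout(3000)
    });
    return response.ok;
  } catch (error) {
    return false;
  }
}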
An equivalent implementation in Rust with axum:

use axum::{
extract::Extension,
http::StatusCode,
response::Json,
routing::get,
Router,
};
use scoutquest_rust::ServiceDiscoveryClient;
use serde_json::{json, Value};
use std::sync::Arc;
use tokio::net::TcpListener;
// Record the process start time at module level so /health can report real uptime
static START: std::sync::OnceLock<std::time::Instant> = std::sync::OnceLock::new();

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    START.get_or_init(std::time::Instant::now);
    let client = Arc::new(ServiceDiscoveryClient::new("http://localhost:8080")?);
let app = Router::new()
.route("/health", get(health_check))
.route("/ready", get(readiness_check))
.layer(Extension(client.clone()));
let listener = TcpListener::bind("0.0.0.0:3000").await?;
println!("Server running on port 3000");
// Register service with health check
client.register_service_with_health(
"user-service",
"localhost",
3000,
Some([("version".to_string(), "1.0.0".to_string())].into()),
"http://localhost:3000/health"
).await?;
axum::serve(listener, app).await?;
Ok(())
}
async fn health_check() -> Json<Value> {
    // Uptime measured from the start instant recorded when the process launched
    let uptime_secs = START
        .get_or_init(std::time::Instant::now)
        .elapsed()
        .as_secs();
    Json(json!({
        "status": "healthy",
        "timestamp": chrono::Utc::now().to_rfc3339(),
        "uptime": uptime_secs,
"version": env!("CARGO_PKG_VERSION")
}))
}
async fn readiness_check() -> Result<Json<Value>, StatusCode> {
// Check dependencies
let database_ready = check_database().await;
let cache_ready = check_cache().await;
if database_ready && cache_ready {
Ok(Json(json!({
"status": "ready",
"timestamp": chrono::Utc::now().to_rfc3339()
})))
} else {
Err(StatusCode::SERVICE_UNAVAILABLE)
}
}
async fn check_database() -> bool {
// Implement database connectivity check
true
}
async fn check_cache() -> bool {
// Implement cache connectivity check
true
}
Advanced Health Checks
Multi-Component Health Checks
// Advanced health check with multiple components
app.get('/health', async (req, res) => {
const healthChecks = {};
let overallHealthy = true;
// Database health check
try {
await db.raw('SELECT 1');
healthChecks.database = {
status: 'healthy',
responseTime: await measureDatabaseResponseTime()
};
} catch (error) {
healthChecks.database = {
status: 'unhealthy',
error: error.message
};
overallHealthy = false;
}
// Redis health check
try {
await redis.ping();
healthChecks.cache = {
status: 'healthy',
responseTime: await measureRedisResponseTime()
};
} catch (error) {
healthChecks.cache = {
status: 'unhealthy',
error: error.message
};
overallHealthy = false;
}
// External API health check
try {
const response = await fetch('https://api.external-service.com/health', {
  signal: AbortSignal.timeout(5000) // fetch has no `timeout` option; abort via signal
});
healthChecks.externalAPI = {
status: response.ok ? 'healthy' : 'degraded',
responseTime: response.headers.get('X-Response-Time')
};
} catch (error) {
healthChecks.externalAPI = {
status: 'unhealthy',
error: error.message
};
// External API failure doesn't mark service as unhealthy
// overallHealthy = false;
}
// Application-specific checks
healthChecks.application = {
status: 'healthy',
memoryUsage: process.memoryUsage(),
cpuUsage: process.cpuUsage(),
activeConnections: getActiveConnectionCount(),
queueSize: getJobQueueSize()
};
const statusCode = overallHealthy ? 200 : 503;
res.status(statusCode).json({
status: overallHealthy ? 'healthy' : 'unhealthy',
timestamp: new Date().toISOString(),
checks: healthChecks,
uptime: process.uptime(),
version: process.env.npm_package_version
});
});
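The multi-component check above leans on a few helper functions. Hedged sketches of what they might look like, reusing the same `db` and `redis` clients; the connection counter and in-memory job queue are assumptions, so substitute whatever your application actually tracks:

// Hypothetical helpers for the multi-component health check above
async function measureDatabaseResponseTime() {
  const start = Date.now();
  await db.raw('SELECT 1');
  return Date.now() - start; // milliseconds
}

async function measureRedisResponseTime() {
  const start = Date.now();
  await redis.ping();
  return Date.now() - start; // milliseconds
}

// Assumes the app maintains its own counter, e.g. incremented on the HTTP server's
// 'connection' event and decremented when each socket closes
let activeConnections = 0;
function getActiveConnectionCount() {
  return activeConnections;
}

// Assumes an in-memory job queue array; swap for your queue client's size API
const jobQueue = [];
function getJobQueueSize() {
  return jobQueue.length;
}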
Health Check Timeouts and Retries
class HealthChecker {
constructor(options = {}) {
this.timeout = options.timeout || 5000;
this.retries = options.retries || 3;
this.retryDelay = options.retryDelay || 1000;
}
async checkWithRetry(checkFunction, componentName) {
for (let attempt = 1; attempt <= this.retries; attempt++) {
try {
const startTime = Date.now();
const result = await Promise.race([
checkFunction(),
new Promise((_, reject) =>
setTimeout(() => reject(new Error('Timeout')), this.timeout)
)
]);
return {
status: 'healthy',
responseTime: Date.now() - startTime,
attempt
};
} catch (error) {
if (attempt === this.retries) {
return {
status: 'unhealthy',
error: error.message,
attempts: attempt
};
}
// Wait before retry
await new Promise(resolve => setTimeout(resolve, this.retryDelay));
}
}
}
async checkDatabase() {
return this.checkWithRetry(
() => db.raw('SELECT 1'),
'database'
);
}
async checkRedis() {
return this.checkWithRetry(
() => redis.ping(),
'cache'
);
}
}
const healthChecker = new HealthChecker({
timeout: 5000,
retries: 3,
retryDelay: 1000
});
app.get('/health', async (req, res) => {
const checks = {};
let overallHealthy = true;
const [databaseCheck, cacheCheck] = await Promise.all([
healthChecker.checkDatabase(),
healthChecker.checkRedis()
]);
checks.database = databaseCheck;
checks.cache = cacheCheck;
if (databaseCheck.status !== 'healthy' || cacheCheck.status !== 'healthy') {
overallHealthy = false;
}
res.status(overallHealthy ? 200 : 503).json({
status: overallHealthy ? 'healthy' : 'unhealthy',
timestamp: new Date().toISOString(),
checks
});
});
ScoutQuest Health Check Configuration
Server-Side Health Check Settings
# config/production.toml
[health_check]
enabled = true
interval = "30s" # How often to check service health
timeout = "5s" # Health check request timeout
retry_count = 3 # Number of retries before marking unhealthy
success_threshold = 2 # Consecutive successes to mark healthy
failure_threshold = 3 # Consecutive failures to mark unhealthy
# Health check HTTP settings
user_agent = "ScoutQuest-HealthChecker/1.0"
follow_redirects = false
expected_status_codes = [200, 204]
# Advanced settings
parallel_checks = true # Run health checks in parallel
max_concurrent_checks = 50 # Maximum concurrent health checks
rate_limit_per_second = 10 # Rate limit health checks per service
# Alerting
alert_on_failure = true
alert_webhook = "https://hooks.slack.com/services/..."
alert_email = "alerts@company.com"
Client-Side Health Check Registration
const client = new ScoutQuestClient({
serverUrl: 'http://localhost:8080'
});
// Register with comprehensive health check configuration
await client.registerService('api-service', 'localhost', 3000, {
version: '1.2.3',
environment: 'production',
team: 'backend'
}, {
// Health check configuration
healthCheck: {
url: 'http://localhost:3000/health',
interval: 30000, // 30 seconds
timeout: 5000, // 5 seconds
retries: 3,
successThreshold: 2,
failureThreshold: 3,
initialDelay: 10000, // Wait 10s before first check
expectedStatusCodes: [200],
expectedResponseBody: { status: 'healthy' }, // Optional
headers: {
'Authorization': 'Bearer health-check-token'
}
},
// Additional endpoints
readinessCheck: {
url: 'http://localhost:3000/ready',
timeout: 3000
},
startupCheck: {
url: 'http://localhost:3000/startup',
timeout: 30000, // Longer timeout for startup
maxAttempts: 10
}
});
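The startupCheck above points at a /startup endpoint that is not implemented elsewhere in this guide. A minimal sketch, assuming the service flips an in-memory flag once initialization (migrations, cache warm-up, connection pools) has finished:

// Hypothetical startup endpoint for the startupCheck registered above
let initializationComplete = false;

async function initialize() {
  // run migrations, warm caches, open connection pools, ...
  initializationComplete = true;
}

app.get('/startup', (req, res) => {
  if (initializationComplete) {
    res.status(200).json({ status: 'started', timestamp: new Date().toISOString() });
  } else {
    res.status(503).json({ status: 'starting', timestamp: new Date().toISOString() });
  }
});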
The equivalent registration in Rust:

use scoutquest_rust::{ServiceDiscoveryClient, HealthCheckConfig};
use std::time::Duration;
use std::collections::HashMap;
let client = ServiceDiscoveryClient::new("http://localhost:8080")?;
let mut headers = HashMap::new();
headers.insert("Authorization".to_string(), "Bearer health-check-token".to_string());
let health_config = HealthCheckConfig::builder()
.url("http://localhost:3000/health")
.interval(Duration::from_secs(30))
.timeout(Duration::from_secs(5))
.retries(3)
.success_threshold(2)
.failure_threshold(3)
.initial_delay(Duration::from_secs(10))
.expected_status_codes(vec![200])
.headers(headers)
.build();
let instance = client.register_service_advanced(
"api-service",
"localhost",
3000,
Some([
("version".to_string(), "1.2.3".to_string()),
("environment".to_string(), "production".to_string()),
].into()),
health_config
).await?;
Health Check Patterns
Graceful Degradation
app.get('/health', async (req, res) => {
const checks = {};
let status = 'healthy';
let statusCode = 200;
// Critical dependencies (must be healthy)
const criticalChecks = await Promise.all([
checkDatabase(),
checkAuthService()
]);
// Non-critical dependencies (can be degraded)
const nonCriticalChecks = await Promise.all([
checkRecommendationService(),
checkAnalyticsService(),
checkNotificationService()
]);
// Evaluate critical dependencies
const criticalFailures = criticalChecks.filter(check => check.status !== 'healthy');
if (criticalFailures.length > 0) {
status = 'unhealthy';
statusCode = 503;
}
// Evaluate non-critical dependencies
const nonCriticalFailures = nonCriticalChecks.filter(check => check.status !== 'healthy');
if (nonCriticalFailures.length > 0 && status === 'healthy') {
status = 'degraded';
statusCode = 200; // Still accept traffic but with reduced functionality
}
res.status(statusCode).json({
status,
timestamp: new Date().toISOString(),
checks: {
critical: Object.fromEntries(
['database', 'auth'].map((name, i) => [name, criticalChecks[i]])
),
nonCritical: Object.fromEntries(
['recommendations', 'analytics', 'notifications'].map((name, i) => [name, nonCriticalChecks[i]])
)
}
});
});
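Each check function used in this pattern only needs to resolve to an object with a `status` field. One hypothetical example for the auth dependency (the URL is an assumption; point it at your real service):

// Hypothetical dependency check used by the graceful-degradation handler above
async function checkAuthService() {
  try {
    const response = await fetch('http://auth-service.internal/health', {
      signal: AbortSignal.timeout(3000)
    });
    return { status: response.ok ? 'healthy' : 'unhealthy' };
  } catch (error) {
    return { status: 'unhealthy', error: error.message };
  }
}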
Circuit Breaker Integration
class CircuitBreakerHealthCheck {
constructor(options = {}) {
this.failureThreshold = options.failureThreshold || 5;
this.recoveryTimeout = options.recoveryTimeout || 60000;
this.state = 'CLOSED'; // CLOSED, OPEN, HALF_OPEN
this.failures = 0;
this.lastFailureTime = null;
}
async checkHealth(healthCheckFunction) {
if (this.state === 'OPEN') {
// Check if enough time has passed to try again
if (Date.now() - this.lastFailureTime > this.recoveryTimeout) {
this.state = 'HALF_OPEN';
} else {
return {
status: 'circuit_open',
message: 'Circuit breaker is open',
state: this.state
};
}
}
try {
const result = await healthCheckFunction();
if (this.state === 'HALF_OPEN') {
  // Success in half-open state closes the circuit
  this.state = 'CLOSED';
}
// Any success resets the consecutive-failure count
this.failures = 0;
return {
status: 'healthy',
...result,
circuitState: this.state
};
} catch (error) {
this.failures++;
this.lastFailureTime = Date.now();
if (this.failures >= this.failureThreshold) {
this.state = 'OPEN';
}
return {
status: 'unhealthy',
error: error.message,
circuitState: this.state,
failures: this.failures
};
}
}
}
const dbCircuitBreaker = new CircuitBreakerHealthCheck({
failureThreshold: 5,
recoveryTimeout: 60000
});
app.get('/health', async (req, res) => {
const databaseHealth = await dbCircuitBreaker.checkHealth(
() => db.raw('SELECT 1')
);
const overallHealthy = databaseHealth.status === 'healthy';
res.status(overallHealthy ? 200 : 503).json({
status: overallHealthy ? 'healthy' : 'unhealthy',
timestamp: new Date().toISOString(),
checks: {
database: databaseHealth
}
});
});
Monitoring Health Checks
Health Check Metrics
const prometheus = require('prom-client');
// Health check metrics
const healthCheckDuration = new prometheus.Histogram({
name: 'health_check_duration_seconds',
help: 'Duration of health check requests',
labelNames: ['component', 'status']
});
const healthCheckTotal = new prometheus.Counter({
name: 'health_check_total',
help: 'Total number of health checks',
labelNames: ['component', 'status']
});
const healthCheckStatus = new prometheus.Gauge({
name: 'health_check_status',
help: 'Health check status (1 = healthy, 0 = unhealthy)',
labelNames: ['component']
});
async function instrumentedHealthCheck(component, checkFunction) {
const startTime = Date.now();
try {
const result = await checkFunction();
const duration = (Date.now() - startTime) / 1000;
healthCheckDuration.labels(component, 'success').observe(duration);
healthCheckTotal.labels(component, 'success').inc();
healthCheckStatus.labels(component).set(1);
return { status: 'healthy', responseTime: duration * 1000 };
} catch (error) {
const duration = (Date.now() - startTime) / 1000;
healthCheckDuration.labels(component, 'failure').observe(duration);
healthCheckTotal.labels(component, 'failure').inc();
healthCheckStatus.labels(component).set(0);
return { status: 'unhealthy', error: error.message, responseTime: duration * 1000 };
}
}
app.get('/health', async (req, res) => {
const [databaseCheck, cacheCheck] = await Promise.all([
instrumentedHealthCheck('database', () => db.raw('SELECT 1')),
instrumentedHealthCheck('cache', () => redis.ping())
]);
const overallHealthy = databaseCheck.status === 'healthy' && cacheCheck.status === 'healthy';
res.status(overallHealthy ? 200 : 503).json({
status: overallHealthy ? 'healthy' : 'unhealthy',
timestamp: new Date().toISOString(),
checks: {
database: databaseCheck,
cache: cacheCheck
}
});
});
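These metrics are only useful if Prometheus can scrape them. With prom-client's default registry, exposing them is a two-liner:

// Expose the collected metrics for Prometheus to scrape
app.get('/metrics', async (req, res) => {
  res.set('Content-Type', prometheus.register.contentType);
  res.end(await prometheus.register.metrics());
});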
Health Check Alerting
class HealthCheckAlerting {
constructor(options = {}) {
this.alertThreshold = options.alertThreshold || 3;
this.recoveryThreshold = options.recoveryThreshold || 2;
this.alertWebhook = options.alertWebhook;
this.componentStates = new Map();
}
async processHealthCheck(component, healthResult) {
const currentState = this.componentStates.get(component) || {
consecutiveFailures: 0,
consecutiveSuccesses: 0,
alertSent: false
};
if (healthResult.status === 'healthy') {
currentState.consecutiveSuccesses++;
currentState.consecutiveFailures = 0;
// Send recovery alert
if (currentState.alertSent && currentState.consecutiveSuccesses >= this.recoveryThreshold) {
await this.sendRecoveryAlert(component, healthResult);
currentState.alertSent = false;
}
} else {
currentState.consecutiveFailures++;
currentState.consecutiveSuccesses = 0;
// Send failure alert
if (!currentState.alertSent && currentState.consecutiveFailures >= this.alertThreshold) {
await this.sendFailureAlert(component, healthResult);
currentState.alertSent = true;
}
}
this.componentStates.set(component, currentState);
}
async sendFailureAlert(component, healthResult) {
const alert = {
type: 'health_check_failure',
component,
status: healthResult.status,
error: healthResult.error,
timestamp: new Date().toISOString(),
severity: 'critical'
};
if (this.alertWebhook) {
try {
await fetch(this.alertWebhook, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(alert)
});
} catch (error) {
console.error('Failed to send alert:', error);
}
}
console.error(`ALERT: ${component} health check failed`, alert);
}
async sendRecoveryAlert(component, healthResult) {
const alert = {
type: 'health_check_recovery',
component,
status: healthResult.status,
timestamp: new Date().toISOString(),
severity: 'info'
};
if (this.alertWebhook) {
try {
await fetch(this.alertWebhook, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(alert)
});
} catch (error) {
console.error('Failed to send recovery alert:', error);
}
}
console.info(`RECOVERY: ${component} health check recovered`, alert);
}
}
const alerting = new HealthCheckAlerting({
alertThreshold: 3,
recoveryThreshold: 2,
alertWebhook: process.env.ALERT_WEBHOOK_URL
});
// Use in health check
app.get('/health', async (req, res) => {
const databaseCheck = await instrumentedHealthCheck('database', () => db.raw('SELECT 1'));
const cacheCheck = await instrumentedHealthCheck('cache', () => redis.ping());
// Process for alerting
await alerting.processHealthCheck('database', databaseCheck);
await alerting.processHealthCheck('cache', cacheCheck);
// Rest of health check logic...
});
Best Practices
✅ Health Check Best Practices
- Keep it fast: Health checks should complete quickly (< 5 seconds)
- Check dependencies: Verify critical external dependencies
- Use appropriate status codes: 200 for healthy, 503 for unhealthy
- Include useful information: Response times, error details, component status
- Implement graceful degradation: Distinguish between critical and non-critical failures
- Monitor health check performance: Track metrics and set up alerts
- Test failure scenarios: Regularly test how your service behaves when dependencies fail
⚠️ Common Pitfalls
- Cascading failures: Don't make health checks depend on other services' health checks
- Heavy operations: Avoid expensive operations in health checks
- False positives: Ensure health checks accurately reflect service health
- Resource exhaustion: Health checks shouldn't consume significant resources
- Single points of failure: Don't rely on a single check for critical dependencies
💡 Advanced Tips
- Implement startup checks: Separate checks for service initialization
- Use circuit breakers: Protect health checks from cascading failures
- Add health check versioning: Allow evolution of health check contracts (see the sketch after this list)
- Include business logic checks: Verify core application functionality
- Implement health check discovery: Let services advertise their health endpoints
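On the versioning tip above: one lightweight approach is to carry a schema version in the payload and keep previously published fields until every consumer has migrated. A hypothetical sketch:

// Hypothetical versioned health payload: bump schemaVersion when the contract changes,
// keep old fields until all consumers have moved off them
app.get('/health', async (req, res) => {
  res.status(200).json({
    schemaVersion: 2,
    status: 'healthy',                      // present since v1
    timestamp: new Date().toISOString(),    // present since v1
    checks: {}                              // added in v2
  });
});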